# Load a package, installing it from CRAN first if it is not already available.
# x: package name as a character string.
# Stops with an error if the package cannot be installed and loaded.
loadPkg <- function(x) {
  if (!require(x, character.only = TRUE, quietly = TRUE)) {
    # dependencies = TRUE (was partial-matched "dep="); T/F replaced by TRUE/FALSE
    install.packages(x, dependencies = TRUE, repos = "http://cran.us.r-project.org")
    if (!require(x, character.only = TRUE)) stop("Package not found")
  }
}
# Load the packages used throughout the analysis.
# Note: "rattle" provides fancyRpartPlot() for trees; answer "no" if asked
# about installing from binary source.
pkgs <- c(
  "randomForest", "DMwR", "ggplot2", "Metrics", "FNN", "leaps",
  "ISLR", "tree", "rpart", "rpart.plot", "rattle", "corrplot"
)
for (p in pkgs) loadPkg(p)
Before importing the data, we preprocessed the raw data in Python to obtain a tidier data frame, since the raw data contains some columns written in JSON format with many attributes.
# Read the csv file (preprocessed in Python from the raw JSON-heavy columns).
df <- read.csv("Movie.csv")
str(df) # have a glance at the data by inspecting its structure
## 'data.frame': 4803 obs. of 12 variables:
## $ X : int 0 1 2 3 4 5 6 7 8 9 ...
## $ budget : int 237000000 300000000 245000000 250000000 260000000 258000000 260000000 280000000 250000000 250000000 ...
## $ genres : Factor w/ 21 levels "","Action","Adventure",..: 2 3 2 2 2 10 4 2 3 2 ...
## $ popularity : num 150.4 139.1 107.4 112.3 43.9 ...
## $ production_companies: Factor w/ 1314 levels "","100 Bares",..: 615 1263 265 696 1263 265 1263 758 1267 320 ...
## $ release_date : Factor w/ 3281 levels "","1916-09-04",..: 2315 1945 3185 2688 2635 1940 2450 3111 2246 3234 ...
## $ revenue : num 2.79e+09 9.61e+08 8.81e+08 1.08e+09 2.84e+08 ...
## $ runtime : num 162 169 148 165 132 139 100 141 153 151 ...
## $ title : Factor w/ 4800 levels "(500) Days of Summer",..: 381 2653 3186 3614 1906 3198 3364 382 1587 444 ...
## $ vote_average : num 7.2 6.9 6.3 7.6 6.1 5.9 7.4 7.3 7.4 5.7 ...
## $ vote_count : int 11800 4500 4466 9106 2124 3576 3330 6767 5293 7004 ...
## $ Number_Genres : int 4 3 3 4 3 3 2 3 3 3 ...
# Remove rows with missing values.
movie <- na.omit(df)
# Remove duplicate rows.
movie <- unique(movie)
# Remove movies with no budget, no revenue, no genre, no vote, no score.
# We could build models to impute the missing values, but there are few such
# rows and no obvious model choice, so we simply drop them.
movie <- subset(movie, !(budget == 0 | revenue == 0 | Number_Genres == 0 |
                           vote_count == 0 | vote_average == 0))
#str(movie)
# Rename some columns and create the profit columns.
colnames(movie)[c(5, 6, 10, 11, 12)] <- c("company", "date", "score", "vote", "genrecount")
movie$profit <- movie$revenue - movie$budget # profit = revenue - budget
# profitable: 1 when profit is strictly positive, 0 otherwise (vectorized;
# replaces a slow row-by-row loop), stored as a factor with levels "0","1".
movie$profitable <- as.factor(as.integer(movie$profit > 0))
# Preprocess the company column: convert to character for easier string matching.
movie$company <- as.character(movie$company)
# Below are the lists of studios belonging to the 5 largest parent studios in
# the world, collected from Wikipedia and other websites. The lists may miss
# some minor studios, but they include the majority of popular ones.
warner <- c("Warner Bros.", "Warner Bros. Pictures", "Warner Bros. Animation", "Warner Bros. Family Entertainment", "Warner Brothers/Seven Arts", "New Line Cinema", "DC Comics","Castle Rock Entertainment")
universal <- c("Universal","Universal Pictures", "Universal Pictures Corporation", "Universal Studios", "DreamWorks", "DreamWorks Animation", "DreamWorks SKG", "Amblin Entertainment", "Working Title Films", "Focus Features", "Focus Films","Gramercy Pictures")
paramount <- c("Paramount Pictures", "Paramount", "Paramount Classics", "Paramount Vantage","MTV Films","Nickelodeon Movies")
walt_disney <- c("Walt Disney", "Walt Disney Animation Studios", "Walt Disney Pictures", "Walt Disney Productions", "Walt Disney Studios Motion Pictures", "Walt Disney Television Animation", "Buena Vista", "Touchstone Pictures", "Hollywood Pictures", "Caravan Pictures","Miramax","Miramax Films","Dimension Films","Marvel Studios", "Twentieth Century Fox Film Corporation", "Blue Sky Studios","Lucasfilm","Fox 2000 Pictures", "Fox Atomic","Fox Entertainment Group","Fox Searchlight Pictures","UTV Motion Pictures")
# BUG FIX: "Columnbia Pictures Industries" was misspelled, so those movies
# could never match and fell through to "Others"; corrected the spelling.
sony <- c("Columbia Pictures","Columbia Pictures Corporation", "Columbia Pictures Industries", "Sony Pictures", "Sony Pictures Animation", "Sony Pictures Classics", "TriStar Pictures","Destination Films")
# Other film companies will be grouped into "Others".
# Map each production company to its parent studio. Vectorized with %in%
# masks (replaces a row-by-row loop); masks are applied from lowest to highest
# priority so that, exactly as in the original if/else chain, Warner wins any
# overlap, then Universal, Paramount, Walt Disney, Sony.
comp <- movie$company
movie$company <- "Others" # default for movies outside the 5 biggest studios
movie$company[comp %in% sony] <- "Sony Pictures"
movie$company[comp %in% walt_disney] <- "Walt Disney"
movie$company[comp %in% paramount] <- "Paramount Pictures"
movie$company[comp %in% universal] <- "Universal Pictures"
movie$company[comp %in% warner] <- "Warner Bros"
movie$company <- as.factor(movie$company) # back to factor type
genre_count <- data.frame(table(movie$genres)) # number of movies in each genre
#genre_count # inspect the counts
# There is a single record with the "Foreign" genre; drop it so train/test
# splits share the same factor levels. With only one record of this unpopular
# genre, removing it does not affect the models or the analysis.
movie <- subset(movie, genres != "Foreign")
row.names(movie) <- NULL # renumber rows after the deletion
# droplevels() discards the now-unused levels; original levels are already
# alphabetical, so this matches the old factor(as.character(...)) approach.
movie$genres <- droplevels(movie$genres)
movie$title <- droplevels(movie$title) # drop titles of removed movies
# Convert the date column to Date format (Y-m-d).
loadPkg("lubridate")
movie$date <- as.Date(as.character(movie$date), format = "%Y-%m-%d")
str(movie) # check the structure now
## 'data.frame': 3225 obs. of 14 variables:
## $ X : int 0 1 2 3 4 5 6 7 8 9 ...
## $ budget : int 237000000 300000000 245000000 250000000 260000000 258000000 260000000 280000000 250000000 250000000 ...
## $ genres : Factor w/ 18 levels "Action","Adventure",..: 1 2 1 1 1 9 3 1 2 1 ...
## $ popularity: num 150.4 139.1 107.4 112.3 43.9 ...
## $ company : Factor w/ 6 levels "Others","Paramount Pictures",..: 1 5 3 1 5 3 5 5 6 6 ...
## $ date : Date, format: "2009-12-10" "2007-05-19" ...
## $ revenue : num 2.79e+09 9.61e+08 8.81e+08 1.08e+09 2.84e+08 ...
## $ runtime : num 162 169 148 165 132 139 100 141 153 151 ...
## $ title : Factor w/ 3224 levels "(500) Days of Summer",..: 259 1761 2129 2420 1265 2139 2256 260 1053 310 ...
## $ score : num 7.2 6.9 6.3 7.6 6.1 5.9 7.4 7.3 7.4 5.7 ...
## $ vote : int 11800 4500 4466 9106 2124 3576 3330 6767 5293 7004 ...
## $ genrecount: int 4 3 3 4 3 3 2 3 3 3 ...
## $ profit : num 2.55e+09 6.61e+08 6.36e+08 8.35e+08 2.41e+07 ...
## $ profitable: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
# Extract the release month; it is recoded into season and quarter below.
mth <- month(movie$date)
movie$season <- mth # temporarily holds the numeric month
# Keep a copy with the raw month under the name "month" (used elsewhere).
movies <- movie
colnames(movies)[15] <- "month"
# Quarter of the year (vectorized; replaces a row-wise loop):
# Q1 = Jan-Mar, Q2 = Apr-Jun, Q3 = Jul-Sep, Q4 = Oct-Dec.
movie$quarter <- paste0("Q", (mth - 1) %/% 3 + 1)
# Meteorological seasons (vectorized lookup by month number):
# Spring = Mar-May, Summer = Jun-Aug, Fall = Sep-Nov, Winter = Dec-Feb.
season_by_month <- c("Winter", "Winter", "Spring", "Spring", "Spring",
                     "Summer", "Summer", "Summer", "Fall", "Fall", "Fall",
                     "Winter")
movie$season <- factor(season_by_month[mth],
                       levels = c("Spring", "Summer", "Fall", "Winter"))
movie$quarter <- as.factor(movie$quarter) # quarter as factor
movie$year <- year(movie$date) # release year, used for the time series part
# We won't use the genrecount column, nor the X column (the default row index
# carried over from the Python cleaning step). Drop them by NAME rather than
# by position so the code stays correct if columns are ever reordered.
movie <- movie[, !(names(movie) %in% c("X", "genrecount"))]
str(movie) # check the structure of our data after cleaning
## 'data.frame': 3225 obs. of 15 variables:
## $ budget : int 237000000 300000000 245000000 250000000 260000000 258000000 260000000 280000000 250000000 250000000 ...
## $ genres : Factor w/ 18 levels "Action","Adventure",..: 1 2 1 1 1 9 3 1 2 1 ...
## $ popularity: num 150.4 139.1 107.4 112.3 43.9 ...
## $ company : Factor w/ 6 levels "Others","Paramount Pictures",..: 1 5 3 1 5 3 5 5 6 6 ...
## $ date : Date, format: "2009-12-10" "2007-05-19" ...
## $ revenue : num 2.79e+09 9.61e+08 8.81e+08 1.08e+09 2.84e+08 ...
## $ runtime : num 162 169 148 165 132 139 100 141 153 151 ...
## $ title : Factor w/ 3224 levels "(500) Days of Summer",..: 259 1761 2129 2420 1265 2139 2256 260 1053 310 ...
## $ score : num 7.2 6.9 6.3 7.6 6.1 5.9 7.4 7.3 7.4 5.7 ...
## $ vote : int 11800 4500 4466 9106 2124 3576 3330 6767 5293 7004 ...
## $ profit : num 2.55e+09 6.61e+08 6.36e+08 8.35e+08 2.41e+07 ...
## $ profitable: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ season : Factor w/ 4 levels "Spring","Summer",..: 4 1 3 2 1 1 3 1 2 1 ...
## $ quarter : Factor w/ 4 levels "Q1","Q2","Q3",..: 4 2 4 3 1 2 4 2 3 1 ...
## $ year : num 2009 2007 2015 2012 2012 ...
summary(movie) # summary of the whole data
## budget genres popularity
## Min. :1.00e+00 Drama :745 Min. : 0
## 1st Qu.:1.05e+07 Comedy :634 1st Qu.: 10
## Median :2.50e+07 Action :588 Median : 20
## Mean :4.07e+07 Adventure:288 Mean : 29
## 3rd Qu.:5.50e+07 Horror :197 3rd Qu.: 37
## Max. :3.80e+08 Crime :141 Max. :876
## (Other) :632
## company date revenue
## Others :1636 Min. :1916-09-04 Min. :5.00e+00
## Paramount Pictures: 255 1st Qu.:1998-09-10 1st Qu.:1.71e+07
## Sony Pictures : 277 Median :2005-07-20 Median :5.52e+07
## Universal Pictures: 338 Mean :2002-03-18 Mean :1.21e+08
## Walt Disney : 497 3rd Qu.:2010-11-11 3rd Qu.:1.46e+08
## Warner Bros : 222 Max. :2016-09-09 Max. :2.79e+09
##
## runtime title score
## Min. : 41 The Host : 2 Min. :2.30
## 1st Qu.: 96 (500) Days of Summer : 1 1st Qu.:5.80
## Median :107 [REC] : 1 Median :6.30
## Mean :111 [REC]² : 1 Mean :6.31
## 3rd Qu.:121 10 Cloverfield Lane : 1 3rd Qu.:6.90
## Max. :338 10 Things I Hate About You: 1 Max. :8.50
## (Other) :3218
## vote profit profitable season quarter
## Min. : 1 Min. :-1.66e+08 0: 787 Spring:704 Q1:656
## 1st Qu.: 179 1st Qu.: 2.52e+05 1:2438 Summer:837 Q2:757
## Median : 471 Median : 2.64e+07 Fall :930 Q3:931
## Mean : 978 Mean : 8.07e+07 Winter:754 Q4:881
## 3rd Qu.: 1148 3rd Qu.: 9.75e+07
## Max. :13752 Max. : 2.55e+09
##
## year
## Min. :1916
## 1st Qu.:1998
## Median :2005
## Mean :2002
## 3rd Qu.:2010
## Max. :2016
##
The variables differ greatly in scale, so we will need to scale the data.
summary(movie[c(6,1,3,7,9,10,11)]) # summary of the numerical variables
## revenue budget popularity runtime
## Min. :5.00e+00 Min. :1.00e+00 Min. : 0 Min. : 41
## 1st Qu.:1.71e+07 1st Qu.:1.05e+07 1st Qu.: 10 1st Qu.: 96
## Median :5.52e+07 Median :2.50e+07 Median : 20 Median :107
## Mean :1.21e+08 Mean :4.07e+07 Mean : 29 Mean :111
## 3rd Qu.:1.46e+08 3rd Qu.:5.50e+07 3rd Qu.: 37 3rd Qu.:121
## Max. :2.79e+09 Max. :3.80e+08 Max. :876 Max. :338
## score vote profit
## Min. :2.30 Min. : 1 Min. :-1.66e+08
## 1st Qu.:5.80 1st Qu.: 179 1st Qu.: 2.52e+05
## Median :6.30 Median : 471 Median : 2.64e+07
## Mean :6.31 Mean : 978 Mean : 8.07e+07
## 3rd Qu.:6.90 3rd Qu.: 1148 3rd Qu.: 9.75e+07
## Max. :8.50 Max. :13752 Max. : 2.55e+09
Variance and SD
sapply(movie[c(6,1,3,7,9,10,11)], sd) # check the sd
## revenue budget popularity runtime score vote
## 1.86e+08 4.44e+07 3.62e+01 2.10e+01 8.60e-01 1.41e+03
## profit
## 1.58e+08
sapply(movie[c(6,1,3,7,9,10,11)], var) # check the var
## revenue budget popularity runtime score vote
## 3.47e+16 1.97e+15 1.31e+03 4.40e+02 7.39e-01 2.00e+06
## profit
## 2.50e+16
The means, variances and standard deviations differ greatly across variables, as most of them are on different scales. We need to scale the data for models such as linear regression, PCR, KNN, etc.
Number of movies by company
# Count the number of movies per company and draw a horizontal bar chart,
# ordered by count.
comp_count <- as.data.frame(table(movie$company))
ggplot(comp_count, aes(x = reorder(Var1, Freq), y = Freq)) +
  geom_bar(stat = 'identity', fill = 'gold', alpha = 0.9) +
  coord_flip() +
  labs(x = "Companies", y = "Number of movies",
       title = "Number of movies by companies") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))
Number of movies by season
# Plot the number of movies released in each season.
season_count <- as.data.frame(table(movie$season))
ggplot(season_count, aes(x = Var1, y = Freq, fill = Var1)) +
  geom_bar(stat = 'identity', alpha = 0.9) +
  labs(x = "Season", y = "Number of movies",
       title = "Number of movies by season") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5)) +
  guides(fill = guide_legend(title = "Season"))
# Plot the number of movies per genre, ordered by count.
genres_count <- as.data.frame(table(movie$genres))
ggplot(genres_count, aes(x = reorder(Var1, Freq), y = Freq)) +
  geom_bar(stat = 'identity', fill = "dodgerblue", alpha = 0.9) +
  coord_flip() +
  labs(x = "Genre", y = "Number of movies",
       title = "Number of movies by genre") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))
In this section, we will try three models: Linear Regression, Decision Tree (Regression Tree since the response is continuous variable) and Random Forest. In each model, we will choose the best formula (adj R2, BIC, CP for Linear Regression, pruning for Regression Tree, tuning for Random Forest). And then we compare the three best models.
loadPkg("corrplot") # (already loaded at the top; a repeated call is harmless)
movie_num <- subset(movie, select = c(6,1,3,7,9,10)) # numerical variables excluding profit; used as model data later
movie_num_all <- data.frame(cbind(movie_num,profit=movie$profit)) # same columns plus profit
#cordf_all = cor(movie_num_all)
#corrplot(cordf_all)
cordf = cor(movie_num) # correlation matrix of the numeric predictors
corrplot(cordf) # visualize the pairwise correlations
Test frequency distributions of revenue in different genres
# One-way ANOVA. Null hypothesis: mean revenue is equal across genres.
anov_genre <- aov(revenue ~ genres, data = movie)
summary(anov_genre)
## Df Sum Sq Mean Sq F value Pr(>F)
## genres 17 1.4e+19 8.21e+17 26.9 <2e-16 ***
## Residuals 3207 9.8e+19 3.06e+16
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#adhoc_genre <- TukeyHSD(anov_genre) # ad hoc proc to check which pairs of genres have different means of profit
#adhoc_genre
Overall, there is evidence that the distributions of revenue in different genres are not the same. It seems that revenue depends on genre.
Check the frequency distributions of revenue in different companies
# One-way ANOVA. Null hypothesis: mean revenue is equal across companies.
anov_comp <- aov(revenue ~ company, data = movie)
summary(anov_comp)
## Df Sum Sq Mean Sq F value Pr(>F)
## company 5 4.80e+18 9.59e+17 28.8 <2e-16 ***
## Residuals 3219 1.07e+20 3.33e+16
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#adhoc_comp <- TukeyHSD(anov_comp) # ad hoc to check which pairs are significant
#adhoc_comp
Overall, there is evidence that the distributions of revenue in different companies are not the same. It seems that revenue depends on the company.
# One-way ANOVA. Null hypothesis: mean revenue is equal across seasons.
anov_ss <- aov(revenue ~ season, data = movie)
summary(anov_ss)
## Df Sum Sq Mean Sq F value Pr(>F)
## season 3 1.59e+18 5.31e+17 15.5 5.3e-10 ***
## Residuals 3221 1.10e+20 3.43e+16
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Tukey post-hoc test: which season pairs differ significantly in mean revenue.
adhoc_ss <- TukeyHSD(anov_ss)
adhoc_ss
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = revenue ~ season, data = movie)
##
## $season
## diff lwr upr p adj
## Summer-Spring 2809117 -21525012 27143245 0.991
## Fall-Spring -46272179 -70043962 -22500396 0.000
## Winter-Spring -37859214 -62797712 -12920716 0.001
## Fall-Summer -49081296 -71752657 -26409934 0.000
## Winter-Summer -40668330 -64560204 -16776456 0.000
## Winter-Fall 8412966 -14905901 31731832 0.790
It seems that winter - fall are in the same group and spring - summer are in the same group.
# Standardize the numeric predictors (mean 0, sd 1) because their variances
# and standard deviations differ greatly. The response "revenue" is left on
# its original scale. (center/scale = TRUE are scale()'s defaults.)
scale_movie <- as.data.frame(scale(movie_num[2:6]))
str(movie_num_all)
## 'data.frame': 3225 obs. of 7 variables:
## $ revenue : num 2.79e+09 9.61e+08 8.81e+08 1.08e+09 2.84e+08 ...
## $ budget : int 237000000 300000000 245000000 250000000 260000000 258000000 260000000 280000000 250000000 250000000 ...
## $ popularity: num 150.4 139.1 107.4 112.3 43.9 ...
## $ runtime : num 162 169 148 165 132 139 100 141 153 151 ...
## $ score : num 7.2 6.9 6.3 7.6 6.1 5.9 7.4 7.3 7.4 5.7 ...
## $ vote : int 11800 4500 4466 9106 2124 3576 3330 6767 5293 7004 ...
## $ profit : num 2.55e+09 6.61e+08 6.36e+08 8.35e+08 2.41e+07 ...
str(scale_movie)
## 'data.frame': 3225 obs. of 5 variables:
## $ budget : num 4.42 5.84 4.6 4.71 4.94 ...
## $ popularity: num 3.355 3.041 2.165 2.301 0.411 ...
## $ runtime : num 2.44 2.78 1.78 2.59 1.01 ...
## $ score : num 1.031 0.6821 -0.0157 1.4963 -0.2483 ...
## $ vote : num 7.65 2.49 2.47 5.74 0.81 ...
set.seed(1000) # reproducible split
# Randomly assign each row to group 1 (train, ~67%) or group 2 (test, ~33%).
movie_sample <- sample(2, nrow(scale_movie), replace = TRUE, prob = c(0.67, 0.33))
in_train <- movie_sample == 1
xtrain1 <- scale_movie[in_train, 1:5]  # predictors, training set
xtest1 <- scale_movie[!in_train, 1:5]  # predictors, test set
ytrain1 <- movie_num[in_train, 1]      # response "revenue", training set
ytest1 <- movie_num[!in_train, 1]      # response "revenue", test set
# Combined data frames with the response in the first column.
train1 <- data.frame(revenue = ytrain1, xtrain1)
test1 <- data.frame(revenue = ytest1, xtest1)
str(train1)
## 'data.frame': 2176 obs. of 6 variables:
## $ revenue : num 2.79e+09 8.81e+08 2.84e+08 8.91e+08 1.41e+09 ...
## $ budget : num 4.42 4.6 4.94 4.89 5.39 ...
## $ popularity: num 3.355 2.165 0.411 2.395 2.908 ...
## $ runtime : num 2.44 1.78 1.01 1.35 1.44 ...
## $ score : num 1.031 -0.0157 -0.2483 -0.4809 1.1474 ...
## $ vote : num 7.65 2.47 0.81 1.84 4.09 ...
str(test1)
## 'data.frame': 1049 obs. of 6 variables:
## $ revenue : num 9.61e+08 1.08e+09 5.92e+08 5.86e+08 8.93e+07 ...
## $ budget : num 5.84 4.71 4.94 3.59 4.83 ...
## $ popularity: num 3.041 2.301 0.542 2.18 0.552 ...
## $ runtime : num 2.779 2.588 -0.511 -0.225 1.825 ...
## $ score : num 0.682 1.496 1.264 -0.248 -0.481 ...
## $ vote : num 2.489 5.745 1.662 1.404 0.942 ...
Construct the model on the training set (using all numerical variables)
loadPkg("faraway") # provides vif() used below
rev_lm1 <- lm(revenue ~., data = train1) # full linear model: all numeric predictors
summary(rev_lm1) # coefficients, R-squared, residual standard error
##
## Call:
## lm(formula = revenue ~ ., data = train1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.21e+08 -3.89e+07 -1.92e+06 2.46e+07 1.60e+09
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 122161921 2168480 56.34 < 2e-16 ***
## budget 82502895 2822244 29.23 < 2e-16 ***
## popularity 14588304 2952684 4.94 8.4e-07 ***
## runtime -1265467 2415512 -0.52 0.60
## score 212212 2648862 0.08 0.94
## vote 85807055 3723449 23.05 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.01e+08 on 2170 degrees of freedom
## Multiple R-squared: 0.711, Adjusted R-squared: 0.711
## F-statistic: 1.07e+03 on 5 and 2170 DF, p-value: <2e-16
vif(rev_lm1) # check vifs
## budget popularity runtime score vote
## 1.68 2.15 1.26 1.50 2.95
Prediction
Testing
# Evaluate the full linear model on the held-out test set.
rev_lm_test1 <- predict(rev_lm1, newdata = test1)
rev_lm_pred1 <- data.frame(cbind(actuals = test1$revenue, predicteds = rev_lm_test1))
# regr.eval() from the DMwR package returns MAE, MSE, RMSE and MAPE.
rev_lm_metrics1 <- regr.eval(rev_lm_pred1$actuals, rev_lm_pred1$predicteds)
rev_lm_metrics1[c(1, 3)] # report MAE and RMSE only
## mae rmse
## 6.25e+07 1.06e+08
# I consulted several sources on the use of MAPE; they give examples of this metric's disadvantages
# We should not use MAPE for comparison between models; RMSE and MAE are sufficient
Training
# Fitted values on the training set. For lm, predict() without newdata
# returns the same values as rev_lm1$fitted.values; we keep the predict()
# form because for ensemble models (e.g. random forest) the two can differ,
# so this is the consistent pattern to use throughout.
rev_lm_train1 <- predict(rev_lm1)
rev_lm_predtrain1 <- data.frame(cbind(actuals = train1$revenue, predicteds = rev_lm_train1))
rev_lm_metrics_train1 <- regr.eval(rev_lm_predtrain1$actuals, rev_lm_predtrain1$predicteds)
# Note: the RMSE of the fitted values equals the residual standard error.
rev_lm_metrics_train1[c(1, 3)]
## mae rmse
## 5.86e+07 1.01e+08
reg.lm <- regsubsets(revenue~., data = train1, nbest = 1, method = "exhaustive") # best-subset selection (leaps), exhaustive search over predictor subsets
plot(reg.lm, scale = "adjr2", main = "Adjusted R^2") # rank subsets by adjusted R^2
plot(reg.lm, scale = "bic", main = "BIC") # ... by BIC
plot(reg.lm, scale = "Cp", main = "Cp") # ... by Mallows' Cp
#summary(reg.lm)
All three feature selection methods show that predictors (budget + popularity + vote) form the best model.
Construct the model on train set
rev_lm2 <- lm(revenue ~ budget + popularity + vote, data = train1) # reduced model with the three predictors chosen by subset selection
summary(rev_lm2)
##
## Call:
## lm(formula = revenue ~ budget + popularity + vote, data = train1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.21e+08 -3.85e+07 -2.19e+06 2.44e+07 1.60e+09
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.22e+08 2.17e+06 56.36 < 2e-16 ***
## budget 8.23e+07 2.62e+06 31.45 < 2e-16 ***
## popularity 1.46e+07 2.95e+06 4.96 7.8e-07 ***
## vote 8.57e+07 3.47e+06 24.72 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.01e+08 on 2172 degrees of freedom
## Multiple R-squared: 0.711, Adjusted R-squared: 0.711
## F-statistic: 1.78e+03 on 3 and 2172 DF, p-value: <2e-16
vif(rev_lm2)
## budget popularity vote
## 1.45 2.15 2.55
Prediction
# Evaluate the reduced model (budget + popularity + vote) on the test set.
rev_lm_test2 <- predict(object = rev_lm2, test1) # prediction on test set
# BUG FIX: the original code paired the actuals with rev_lm_test1 /
# rev_lm_train1 (model 1's predictions), so the reported "model 2" metrics
# were identical to model 1's by construction. Use model 2's own predictions;
# the printed metrics below should be regenerated.
rev_lm_pred2 <- data.frame(cbind(actuals = test1$revenue, predicteds = rev_lm_test2))
rev_lm_metrics2 <- regr.eval(rev_lm_pred2$actuals, rev_lm_pred2$predicteds) # metrics
rev_lm_metrics2[c(1,3)] # show MAE and RMSE
## mae rmse
## 6.25e+07 1.06e+08
rev_lm_train2 <- predict(object = rev_lm2) # predicted (fitted) values on train set
rev_lm_predtrain2 <- data.frame(cbind(actuals = train1$revenue, predicteds = rev_lm_train2))
rev_lm_metrics_train2 <- regr.eval(rev_lm_predtrain2$actuals, rev_lm_predtrain2$predicteds) # metrics
rev_lm_metrics_train2[c(1,3)] # show MAE and RMSE
## mae rmse
## 5.86e+07 1.01e+08
No change when comparing to the model containing all numerical variables. (NOTE: the metrics above were actually computed from model 1's predictions — rev_lm_test1/rev_lm_train1 — due to a copy-paste slip, so they are identical to model 1's by construction; recompute with rev_lm_test2/rev_lm_train2 to verify this conclusion.) We can remove unnecessary variables without reducing Adjusted R-squared or increasing RMSE. The best model is less complex and can avoid overfitting due to many predictors (high dimensionality).
# Append the categorical predictors (genres, company, season) to copies of
# the train and test sets, using the same random split as before.
train1_full <- train1 # copy so the numeric-only train set is kept intact
# Subset each column directly instead of subsetting all 14 columns of `movie`
# and then extracting one — same values, far less copying.
train1_full$genres <- movie$genres[movie_sample == 1]
train1_full$company <- movie$company[movie_sample == 1]
train1_full$season <- movie$season[movie_sample == 1]
# We do the same with the test set.
test1_full <- test1
test1_full$genres <- movie$genres[movie_sample == 2]
test1_full$company <- movie$company[movie_sample == 2]
test1_full$season <- movie$season[movie_sample == 2]
str(train1_full)
## 'data.frame': 2176 obs. of 9 variables:
## $ revenue : num 2.79e+09 8.81e+08 2.84e+08 8.91e+08 1.41e+09 ...
## $ budget : num 4.42 4.6 4.94 4.89 5.39 ...
## $ popularity: num 3.355 2.165 0.411 2.395 2.908 ...
## $ runtime : num 2.44 1.78 1.01 1.35 1.44 ...
## $ score : num 1.031 -0.0157 -0.2483 -0.4809 1.1474 ...
## $ vote : num 7.65 2.47 0.81 1.84 4.09 ...
## $ genres : Factor w/ 18 levels "Action","Adventure",..: 1 1 1 9 1 2 1 2 2 2 ...
## $ company : Factor w/ 6 levels "Others","Paramount Pictures",..: 1 3 5 3 5 6 6 6 5 5 ...
## $ season : Factor w/ 4 levels "Spring","Summer",..: 4 3 1 1 1 2 1 2 2 1 ...
str(test1_full)
## 'data.frame': 1049 obs. of 9 variables:
## $ revenue : num 9.61e+08 1.08e+09 5.92e+08 5.86e+08 8.93e+07 ...
## $ budget : num 5.84 4.71 4.94 3.59 4.83 ...
## $ popularity: num 3.041 2.301 0.542 2.18 0.552 ...
## $ runtime : num 2.779 2.588 -0.511 -0.225 1.825 ...
## $ score : num 0.682 1.496 1.264 -0.248 -0.481 ...
## $ vote : num 2.489 5.745 1.662 1.404 0.942 ...
## $ genres : Factor w/ 18 levels "Action","Adventure",..: 2 1 3 2 1 1 1 7 16 2 ...
## $ company : Factor w/ 6 levels "Others","Paramount Pictures",..: 5 1 5 1 5 1 1 2 4 1 ...
## $ season : Factor w/ 4 levels "Spring","Summer",..: 1 2 3 3 2 2 1 3 1 1 ...
rev_lm4 <- lm(revenue ~., data = train1_full)
summary(rev_lm4)
##
## Call:
## lm(formula = revenue ~ ., data = train1_full)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.08e+08 -4.03e+07 -1.43e+06 2.90e+07 1.62e+09
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 108110505 6817790 15.86 < 2e-16 ***
## budget 77348867 3033574 25.50 < 2e-16 ***
## popularity 13617667 2932282 4.64 3.6e-06 ***
## runtime 5900196 2594312 2.27 0.02305 *
## score -1602884 2751829 -0.58 0.56030
## vote 87028064 3716646 23.42 < 2e-16 ***
## genresAdventure 14998048 8669568 1.73 0.08378 .
## genresAnimation 87790816 13236699 6.63 4.2e-11 ***
## genresComedy 25268186 7163941 3.53 0.00043 ***
## genresCrime -10102172 11467478 -0.88 0.37845
## genresDocumentary 44711638 23188936 1.93 0.05397 .
## genresDrama 4923647 7294556 0.67 0.49976
## genresFamily 82467003 20391297 4.04 5.4e-05 ***
## genresFantasy 2870822 13660650 0.21 0.83357
## genresHistory 15689738 24998838 0.63 0.53032
## genresHorror 17619142 10531825 1.67 0.09448 .
## genresMusic 23079584 29354497 0.79 0.43182
## genresMystery 6103862 24024211 0.25 0.79946
## genresRomance 17057626 14926772 1.14 0.25327
## genresScience Fiction -15440346 15106799 -1.02 0.30686
## genresThriller -7679779 12337527 -0.62 0.53370
## genresWar -56563625 33719837 -1.68 0.09360 .
## genresWestern -3456414 24929422 -0.14 0.88974
## companyParamount Pictures 19120256 8303147 2.30 0.02139 *
## companySony Pictures 4555821 7970621 0.57 0.56767
## companyUniversal Pictures 16743343 7455117 2.25 0.02481 *
## companyWalt Disney 16706288 6373478 2.62 0.00882 **
## companyWarner Bros -535044 8559594 -0.06 0.95016
## seasonSummer -822742 6192423 -0.13 0.89431
## seasonFall -9929435 6117507 -1.62 0.10471
## seasonWinter -5891796 6282550 -0.94 0.34845
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 99300000 on 2145 degrees of freedom
## Multiple R-squared: 0.725, Adjusted R-squared: 0.721
## F-statistic: 188 on 30 and 2145 DF, p-value: <2e-16
vif(rev_lm4)
## budget popularity
## 2.01 2.20
## runtime score
## 1.51 1.68
## vote genresAdventure
## 3.04 1.40
## genresAnimation genresComedy
## 1.25 1.79
## genresCrime genresDocumentary
## 1.26 1.08
## genresDrama genresFamily
## 2.06 1.08
## genresFantasy genresHistory
## 1.14 1.07
## genresHorror genresMusic
## 1.32 1.04
## genresMystery genresRomance
## 1.04 1.13
## genresScience Fiction genresThriller
## 1.11 1.18
## genresWar genresWestern
## 1.03 1.06
## companyParamount Pictures companySony Pictures
## 1.09 1.11
## companyUniversal Pictures companyWalt Disney
## 1.12 1.18
## companyWarner Bros seasonSummer
## 1.08 1.61
## seasonFall seasonWinter
## 1.65 1.60
The p-values and t-values indicate that there is no significance among seasons. Season seems not to be a necessary predictor.
reg.lm1 <- regsubsets(revenue~., data = train1_full, nbest = 1, method = "exhaustive") # best-subset selection (leaps) including the categorical predictors
plot(reg.lm1, scale = "adjr2", main = "Adjusted R^2") # rank subsets by adjusted R^2
plot(reg.lm1, scale = "bic", main = "BIC") # ... by BIC
plot(reg.lm1, scale = "Cp", main = "Cp") # ... by Mallows' Cp
When including season, genre and company in the model, the best numerical predictors are still budget, popularity and vote. The effects of the different seasons do not appear significant. We will build the model with these 3 numerical predictors and 2 categorical variables: genre and company.
rev_lm3 <- lm(revenue ~ budget + vote + company + genres + popularity , data = train1_full) # the 3 selected numeric predictors plus genre and company (season excluded as not significant)
summary(rev_lm3)
##
## Call:
## lm(formula = revenue ~ budget + vote + company + genres + popularity,
## data = train1_full)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.14e+08 -4.04e+07 -8.25e+05 2.96e+07 1.62e+09
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 103783104 5493099 18.89 < 2e-16 ***
## budget 79307464 2810718 28.22 < 2e-16 ***
## vote 87343750 3432074 25.45 < 2e-16 ***
## companyParamount Pictures 20187352 8296723 2.43 0.01505 *
## companySony Pictures 4706890 7968099 0.59 0.55477
## companyUniversal Pictures 17875529 7436574 2.40 0.01631 *
## companyWalt Disney 16364447 6369330 2.57 0.01026 *
## companyWarner Bros -135185 8558273 -0.02 0.98740
## genresAdventure 14924242 8645045 1.73 0.08443 .
## genresAnimation 80203108 12811282 6.26 4.6e-10 ***
## genresComedy 24197175 7152449 3.38 0.00073 ***
## genresCrime -8821228 11302914 -0.78 0.43522
## genresDocumentary 40500854 22990580 1.76 0.07827 .
## genresDrama 6560315 6935298 0.95 0.34429
## genresFamily 78112299 20296238 3.85 0.00012 ***
## genresFantasy 1516100 13618042 0.11 0.91136
## genresHistory 22335236 24701813 0.90 0.36599
## genresHorror 15897705 10491528 1.52 0.12985
## genresMusic 22199706 29264598 0.76 0.44818
## genresMystery 3544411 24017226 0.15 0.88269
## genresRomance 16669177 14880624 1.12 0.26276
## genresScience Fiction -15795028 15101070 -1.05 0.29570
## genresThriller -7928898 12337416 -0.64 0.52051
## genresWar -55041648 33646312 -1.64 0.10201
## genresWestern 726641 24731085 0.03 0.97656
## popularity 13447493 2930278 4.59 4.7e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 99400000 on 2150 degrees of freedom
## Multiple R-squared: 0.724, Adjusted R-squared: 0.721
## F-statistic: 225 on 25 and 2150 DF, p-value: <2e-16
vif(rev_lm3)
## budget vote
## 1.73 2.59
## companyParamount Pictures companySony Pictures
## 1.09 1.11
## companyUniversal Pictures companyWalt Disney
## 1.11 1.17
## companyWarner Bros genresAdventure
## 1.08 1.39
## genresAnimation genresComedy
## 1.17 1.78
## genresCrime genresDocumentary
## 1.22 1.06
## genresDrama genresFamily
## 1.86 1.07
## genresFantasy genresHistory
## 1.13 1.04
## genresHorror genresMusic
## 1.30 1.03
## genresMystery genresRomance
## 1.04 1.12
## genresScience Fiction genresThriller
## 1.11 1.17
## genresWar genresWestern
## 1.03 1.04
## popularity
## 2.19
The adjusted R-squared increases by 1.0% compared to the best model with only numerical variables.
Testing
# Evaluate the full linear model (rev_lm3) on the held-out test set.
#loadPkg("MLmetrics")
rev_lm_test3 <- predict(object = rev_lm3, newdata = test1_full) # prediction on test set
# Pair actual revenues with predictions so regr.eval can compute error metrics.
rev_lm_predtest3 <- data.frame(cbind(actuals=test1_full$revenue, predicteds=rev_lm_test3))
rev_lm_metrics_test3 <- regr.eval(rev_lm_predtest3$actuals, rev_lm_predtest3$predicteds) # metrics (RMSE, MAE, MSE, MAPE)
rev_lm_metrics_test3[c(1,3)] # show only MAE and RMSE
## mae rmse
## 6.19e+07 1.05e+08
Training
# Evaluate the same model on the training set (predict with no newdata uses the fitted values).
rev_lm_train3 <- predict(object = rev_lm3) # predicted values in train set
rev_lm_predtrain3 <- data.frame(cbind(actuals=train1_full$revenue, predicteds=rev_lm_train3))
rev_lm_metrics_train3 <- regr.eval(rev_lm_predtrain3$actuals, rev_lm_predtrain3$predicteds) # metrics (RMSE, MAE, MSE, MAPE)
# NOTE(review): options() has no "scientific" option (the print-format option is "scipen");
# only `digits` takes effect here — confirm the intent.
options(scientific=T, digits = 2) # to make the format of metrics the same as above metrics (as Xe+07)
rev_lm_metrics_train3[c(1,3)]
## mae rmse
## 5.8e+07 9.9e+07
options(scientific=T, digits = 3) # change back to our default format
A slight improvement in this model: the adjusted R-squared increases by 1% and the RMSE in both the training set and the testing set slightly decreases.
# Compare the two candidate models: AIC favors predictive fit, BIC penalizes
# extra parameters more heavily (rev_lm3 adds company + genres dummies).
AIC(object=rev_lm2) # best model with budget + popularity + vote
## [1] 86395
BIC(object=rev_lm2)
## [1] 86424
cat("\n")
AIC(object=rev_lm3) # best model including genres and company
## [1] 86343
BIC(object=rev_lm3)
## [1] 86497
Model 1: budget + vote + popularity Model 2: budget + vote + popularity + company + genres
Model 1 has higher AIC than Model 2, which indicates that Model 2 is better for predicting the revenue. Model 1 has lower BIC than Model 2, which indicates that Model 1 is better as a true function to explain the revenue. (BIC prefers simple models)
With Decision Tree we can address both numerical and categorical variables in the model. We perform the prediction of revenue (a continuous response) so we use regression tree.
We can try two functions to build a regression tree model
# Regression tree via the tree package; revenue ~ all other columns of train1_full.
rev_tree0 <- tree(revenue ~ ., data=train1_full) # use tree function
summary(rev_tree0)
##
## Regression tree:
## tree(formula = revenue ~ ., data = train1_full)
## Variables actually used in tree construction:
## [1] "vote" "budget" "genres"
## Number of terminal nodes: 10
## Residual mean deviance: 9.85e+15 = 2.13e+19 / 2170
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -6.75e+08 -3.76e+07 -1.25e+07 0.00e+00 2.52e+07 1.24e+09
# Plot the tree-package model, then refit the same regression tree with rpart.
plot(rev_tree0)
text(rev_tree0,cex=0.7)
# method = "anova" requests a regression (continuous-response) tree.
rev_tree <- rpart(revenue ~ ., method ="anova", data=train1_full) #use rpart with method = "anova"
#plotcp(rev_tree ) # visualize cross-validation results
summary(rev_tree ) # detailed summary of splits
## Call:
## rpart(formula = revenue ~ ., data = train1_full, method = "anova")
## n= 2176
##
## CP nsplit rel error xerror xstd
## 1 0.3861 0 1.000 1.001 0.1190
## 2 0.1250 1 0.614 0.709 0.0853
## 3 0.0704 2 0.489 0.535 0.0660
## 4 0.0528 3 0.418 0.483 0.0641
## 5 0.0265 4 0.366 0.455 0.0642
## 6 0.0256 5 0.339 0.452 0.0643
## 7 0.0139 6 0.313 0.412 0.0525
## 8 0.0107 7 0.300 0.383 0.0515
## 9 0.0100 8 0.289 0.385 0.0516
##
## Variable importance
## vote popularity budget score genres season
## 42 22 20 7 6 1
## runtime company
## 1 1
##
## Node number 1: 2176 observations, complexity param=0.386
## mean=1.2e+08, MSE=3.53e+16
## left son=2 (1947 obs) right son=3 (229 obs)
## Primary splits:
## vote < 0.949 to the left, improve=0.3860, (0 missing)
## budget < 1.17 to the left, improve=0.3840, (0 missing)
## popularity < 1.27 to the left, improve=0.3370, (0 missing)
## genres splits as RRRLLLLRRLLLLLRLLL, improve=0.0995, (0 missing)
## runtime < 0.609 to the left, improve=0.0587, (0 missing)
## Surrogate splits:
## popularity < 0.833 to the left, agree=0.948, adj=0.502, (0 split)
## budget < 2.45 to the left, agree=0.911, adj=0.153, (0 split)
## score < 1.79 to the left, agree=0.902, adj=0.066, (0 split)
##
## Node number 2: 1947 observations, complexity param=0.0704
## mean=8e+07, MSE=9.5e+15
## left son=4 (1481 obs) right son=5 (466 obs)
## Primary splits:
## vote < -0.099 to the left, improve=0.2930, (0 missing)
## popularity < -0.00948 to the left, improve=0.2540, (0 missing)
## budget < 0.716 to the left, improve=0.2480, (0 missing)
## company splits as LRRRRR, improve=0.0571, (0 missing)
## genres splits as LRRLLLLRRLLLLLLLLL, improve=0.0563, (0 missing)
## Surrogate splits:
## popularity < 0.0996 to the left, agree=0.922, adj=0.674, (0 split)
## budget < 1.09 to the left, agree=0.791, adj=0.129, (0 split)
## score < 2.02 to the left, agree=0.762, adj=0.004, (0 split)
## runtime < 4.4 to the left, agree=0.761, adj=0.002, (0 split)
##
## Node number 3: 229 observations, complexity param=0.125
## mean=4.61e+08, MSE=1.25e+17
## left son=6 (101 obs) right son=7 (128 obs)
## Primary splits:
## budget < 0.739 to the left, improve=0.335, (0 missing)
## vote < 2.5 to the left, improve=0.269, (0 missing)
## genres splits as RRRLL-LRRLL-LLRLLR, improve=0.210, (0 missing)
## popularity < 1.8 to the left, improve=0.176, (0 missing)
## runtime < 1.18 to the left, improve=0.093, (0 missing)
## Surrogate splits:
## genres splits as RRRLL-LRRLL-LLRLLR, agree=0.790, adj=0.525, (0 split)
## score < 1.44 to the right, agree=0.716, adj=0.356, (0 split)
## vote < 1.97 to the left, agree=0.633, adj=0.168, (0 split)
## popularity < 1.15 to the left, agree=0.607, adj=0.109, (0 split)
## season splits as RRLR, agree=0.607, adj=0.109, (0 split)
##
## Node number 4: 1481 observations, complexity param=0.0139
## mean=5.04e+07, MSE=3.52e+15
## left son=8 (772 obs) right son=9 (709 obs)
## Primary splits:
## vote < -0.497 to the left, improve=0.2040, (0 missing)
## budget < -0.32 to the left, improve=0.1950, (0 missing)
## popularity < -0.38 to the left, improve=0.1820, (0 missing)
## company splits as LRRRRR, improve=0.0553, (0 missing)
## runtime < 0.18 to the left, improve=0.0165, (0 missing)
## Surrogate splits:
## popularity < -0.389 to the left, agree=0.916, adj=0.825, (0 split)
## budget < -0.32 to the left, agree=0.630, adj=0.227, (0 split)
## genres splits as RLRLRLLLRLRLRLRRRL, agree=0.568, adj=0.097, (0 split)
## company splits as LRLRLL, agree=0.565, adj=0.092, (0 split)
## score < -0.423 to the left, agree=0.554, adj=0.068, (0 split)
##
## Node number 5: 466 observations, complexity param=0.0256
## mean=1.74e+08, MSE=1.69e+16
## left son=10 (322 obs) right son=11 (144 obs)
## Primary splits:
## budget < 0.649 to the left, improve=0.2500, (0 missing)
## genres splits as LLRLL-LRLRLLLLLLLL, improve=0.1220, (0 missing)
## company splits as LRRRRR, improve=0.0808, (0 missing)
## score < 0.973 to the right, improve=0.0708, (0 missing)
## vote < 0.432 to the left, improve=0.0478, (0 missing)
## Surrogate splits:
## genres splits as LRRLL-LRLRLLLLLLLL, agree=0.755, adj=0.208, (0 split)
## score < -0.423 to the right, agree=0.710, adj=0.063, (0 split)
## popularity < 1.54 to the left, agree=0.695, adj=0.014, (0 split)
## vote < 0.909 to the left, agree=0.695, adj=0.014, (0 split)
##
## Node number 6: 101 observations, complexity param=0.0107
## mean=2.3e+08, MSE=3.14e+16
## left son=12 (85 obs) right son=13 (16 obs)
## Primary splits:
## genres splits as LRRLL-L-LLL-RLLLL-, improve=0.2590, (0 missing)
## vote < 2.55 to the left, improve=0.2120, (0 missing)
## budget < -0.192 to the left, improve=0.1350, (0 missing)
## popularity < 1.81 to the left, improve=0.0997, (0 missing)
## season splits as RRLR, improve=0.0770, (0 missing)
## Surrogate splits:
## runtime < -0.941 to the right, agree=0.851, adj=0.063, (0 split)
## score < -0.365 to the right, agree=0.851, adj=0.063, (0 split)
##
## Node number 7: 128 observations, complexity param=0.0528
## mean=6.43e+08, MSE=1.24e+17
## left son=14 (71 obs) right son=15 (57 obs)
## Primary splits:
## vote < 2.44 to the left, improve=0.2550, (0 missing)
## budget < 3.78 to the left, improve=0.2390, (0 missing)
## popularity < 3.14 to the left, improve=0.2330, (0 missing)
## runtime < 1.13 to the left, improve=0.0954, (0 missing)
## score < -0.772 to the left, improve=0.0715, (0 missing)
## Surrogate splits:
## score < 0.624 to the left, agree=0.719, adj=0.368, (0 split)
## popularity < 1.67 to the left, agree=0.711, adj=0.351, (0 split)
## runtime < 1.13 to the left, agree=0.656, adj=0.228, (0 split)
## budget < 2.57 to the left, agree=0.641, adj=0.193, (0 split)
## company splits as LLLLRR, agree=0.633, adj=0.175, (0 split)
##
## Node number 8: 772 observations
## mean=2.47e+07, MSE=7.93e+14
##
## Node number 9: 709 observations
## mean=7.84e+07, MSE=4.98e+15
##
## Node number 10: 322 observations
## mean=1.3e+08, MSE=9.82e+15
##
## Node number 11: 144 observations
## mean=2.71e+08, MSE=1.91e+16
##
## Node number 12: 85 observations
## mean=1.91e+08, MSE=1.99e+16
##
## Node number 13: 16 observations
## mean=4.38e+08, MSE=4.15e+16
##
## Node number 14: 71 observations
## mean=4.83e+08, MSE=4.95e+16
##
## Node number 15: 57 observations, complexity param=0.0265
## mean=8.41e+08, MSE=1.46e+17
## left son=30 (47 obs) right son=31 (10 obs)
## Primary splits:
## budget < 3.98 to the left, improve=0.2450, (0 missing)
## popularity < 2.88 to the left, improve=0.2080, (0 missing)
## vote < 4.63 to the left, improve=0.1260, (0 missing)
## score < 1.32 to the right, improve=0.0656, (0 missing)
## genres splits as RRR-L-LRL-----R--L, improve=0.0449, (0 missing)
## Surrogate splits:
## vote < 7.31 to the left, agree=0.842, adj=0.1, (0 split)
##
## Node number 30: 47 observations
## mean=7.54e+08, MSE=6.86e+16
##
## Node number 31: 10 observations
## mean=1.25e+09, MSE=3.06e+17
#printcp(rev_tree ) # display the results
# [9,3] is the "rel error" of the last (9th) row of the CP table, so this is
# the in-sample pseudo R-squared of the full (unpruned) tree.
rev_tree.rsq = 1 - printcp(rev_tree)[9,3] # we can calculate R-squared using equation: R-squared = 1 - rel error
##
## Regression tree:
## rpart(formula = revenue ~ ., data = train1_full, method = "anova")
##
## Variables actually used in tree construction:
## [1] budget genres vote
##
## Root node error: 8e+19/2176 = 4e+16
##
## n= 2176
##
## CP nsplit rel error xerror xstd
## 1 0.39 0 1.0 1.0 0.12
## 2 0.13 1 0.6 0.7 0.09
## 3 0.07 2 0.5 0.5 0.07
## 4 0.05 3 0.4 0.5 0.06
## 5 0.03 4 0.4 0.5 0.06
## 6 0.03 5 0.3 0.5 0.06
## 7 0.01 6 0.3 0.4 0.05
## 8 0.01 7 0.3 0.4 0.05
## 9 0.01 8 0.3 0.4 0.05
# The R-squared formula above (1 - rel error) follows a stackoverflow answer;
# not guaranteed to be 100% correct.
# Visualize the tree using 2 methods: prp() and fancyRpartPlot().
# Fix: this model is a regression tree (method = "anova"), so the title should
# not say "Classification Trees".
prp(rev_tree, main = "Regression Tree for Revenue")
fancyRpartPlot(rev_tree)
2 methods give the same tree. rpart() gives access to nicer plots.
Testing
# Test-set error of the unpruned rpart tree.
rev_tree_pred.test <- predict(rev_tree, newdata = test1_full)
actual_pred_tree.test <- as.data.frame(cbind(actuals = test1_full$revenue, predicteds = rev_tree_pred.test))
metrics_test_tree <- regr.eval(actual_pred_tree.test$actuals, actual_pred_tree.test$predicteds)
metrics_test_tree[c(1,3)] # MAE and RMSE only
## mae rmse
## 6.54e+07 1.13e+08
Training
# Training-set error of the unpruned rpart tree.
rev_tree_pred.train <- predict(rev_tree, train1_full)
actual_pred_tree.train <- as.data.frame(cbind(actuals = train1_full$revenue, predicteds = rev_tree_pred.train))
metrics_train_tree <- regr.eval(actual_pred_tree.train$actuals, actual_pred_tree.train$predicteds)
metrics_train_tree[c(1,3)] # MAE and RMSE only
## mae rmse
## 5.93e+07 1.01e+08
The results seem to be worse than linear models.
We can see the error for each Cp.
# Display the complexity-parameter table (rel error and cross-validated xerror per CP).
printcp(rev_tree)
##
## Regression tree:
## rpart(formula = revenue ~ ., data = train1_full, method = "anova")
##
## Variables actually used in tree construction:
## [1] budget genres vote
##
## Root node error: 8e+19/2176 = 4e+16
##
## n= 2176
##
## CP nsplit rel error xerror xstd
## 1 0.39 0 1.0 1.0 0.12
## 2 0.13 1 0.6 0.7 0.09
## 3 0.07 2 0.5 0.5 0.07
## 4 0.05 3 0.4 0.5 0.06
## 5 0.03 4 0.4 0.5 0.06
## 6 0.03 5 0.3 0.5 0.06
## 7 0.01 6 0.3 0.4 0.05
## 8 0.01 7 0.3 0.4 0.05
## 9 0.01 8 0.3 0.4 0.05
plotcp(rev_tree) # visualize cross-validation results
# prune the tree for optimization and avoid overfitting
rev_tree$cptable[,"xerror"] # we can see the lowest xerror
## 1 2 3 4 5 6 7 8 9
## 1.001 0.709 0.535 0.483 0.455 0.452 0.412 0.383 0.385
The cross-validation error (xerror) is lowest at the 8th CP value (nsplit = 7).
# choose the CP with minimum xerror (row 8, xerror = 0.383) and prune the tree
rev_tree_prune <- prune(rev_tree, cp = rev_tree$cptable[8,"CP"])
fancyRpartPlot(rev_tree_prune)
Testing
# choose the CP with minimum xerror and prune the tree (done above)
#rev_tree_prune <- prune(rev_tree, cp = rev_tree$cptable[8,"CP"])
# testing: error of the pruned tree on the held-out set
rev_tree_pred.test <- predict(rev_tree_prune, newdata = test1_full)
actual_pred_tree.test <- as.data.frame(cbind(actuals = test1_full$revenue, predicteds = rev_tree_pred.test))
metrics_test_tree <- regr.eval(actual_pred_tree.test$actuals, actual_pred_tree.test$predicteds)
metrics_test_tree[c(1,3)]
## mae rmse
## 6.59e+07 1.15e+08
Training
# training: error of the pruned tree on the training set
rev_tree_pred.train <- predict(rev_tree_prune)
actual_pred_tree.train <- as.data.frame(cbind(actuals = train1_full$revenue, predicteds = rev_tree_pred.train))
metrics_train_tree <- regr.eval(actual_pred_tree.train$actuals, actual_pred_tree.train$predicteds)
metrics_train_tree[c(1,3)]
## mae rmse
## 6.04e+07 1.03e+08
# the results are very close to (slightly worse than) the unpruned tree
There are no significant changes in the results.
The data contains many variables. Furthermore, we are predicting a testing data using the model constructed on training set. Therefore, Random Forest (RF) would be better than Decision Tree (which prefers fewer variables and predicts within the sample/training data).
set.seed(123) # set.seed to make sure the selection of tree samples are the same if we re-run the code
# Random Forest regression on the training set with 350 trees (default mtry for
# regression is floor(p/3), reported as 2 in the output below).
rev_rf = randomForest(revenue~. , train1_full, ntree = 350) # construct RF model on training data, we set the number of trees as 350
rev_rf # the results of the model
##
## Call:
## randomForest(formula = revenue ~ ., data = train1_full, ntree = 350)
## Type of random forest: regression
## Number of trees: 350
## No. of variables tried at each split: 2
##
## Mean of squared residuals: 9.46e+15
## % Var explained: 73.2
# Variable importance (IncNodePurity = total decrease in node impurity from splits on that variable).
randomForest::importance(rev_rf) # the importance of each predictor
## IncNodePurity
## budget 1.90e+19
## popularity 1.43e+19
## runtime 4.15e+18
## score 3.39e+18
## vote 2.33e+19
## genres 5.58e+18
## company 2.60e+18
## season 1.49e+18
Vote, budget and popularity have the highest importances. In our best linear model, the selected numerical variables are the same as those in the Random Forest. The two models seem to agree.
There is a difference when accommodating categorical variables. The Linear Model does not select runtime but adds company and genres, while in the Random Forest model runtime is more important than company. In this case, both models seem to agree on the genres variable only.
# Plot OOB MSE against number of trees to see where the error curve flattens.
plot(rev_rf)
#rev_rf$mse
When the number of trees increases, the mean squared error (MSE) decreases. After a certain number of trees (around 100 in our case), the MSE does not change significantly.
Testing
# predicting the model on the testing data
rev_rf.test <- predict(rev_rf, test1_full)
actual_pred_rf.test <- as.data.frame(cbind(actuals = test1_full$revenue, predicteds = rev_rf.test))
metrics_test_rf <- regr.eval(actual_pred_rf.test$actuals, actual_pred_rf.test$predicteds)
# NOTE(review): only `digits` is a real option here; "scientific" is not an options() name.
options(scientific=T, digits = 2) # to make the format of metrics the same as above metrics (as Xe+07)
metrics_test_rf[c(1,3)]
## mae rmse
## 5.5e+07 9.9e+07
options(scientific=T, digits = 3) # change back to our default format
Training
#rev_rf$predicted
# predict() on a randomForest with no newdata returns the out-of-bag (OOB)
# predictions, so these "training" metrics are OOB metrics, not resubstitution error.
rev_rf.train <- predict(rev_rf) # same results as rev_rf$predicted, both codes return the predicted values in the training set
actual_pred_rf.train <- as.data.frame(cbind(actuals = train1_full$revenue, predicteds = rev_rf.train))
metrics_train_rf <- regr.eval(actual_pred_rf.train$actuals, actual_pred_rf.train$predicteds)
options(scientific=T, digits = 2) # to make the format of metrics the same as above metrics (as Xe+07)
metrics_train_rf[c(1,3)]
## mae rmse
## 5.3e+07 9.7e+07
options(scientific=T, digits = 3) # change back to our default format
Random Forest has lower RMSEs and MAEs than Linear Model.
The pseudo R-squared of RF is slightly higher than Adj R-squared of Linear Model.
Let’s look if we can make our RF model better by using tuning method.
# there is a parameter called mtry (number of variables tried at each split) used to tune a random forest model
# we use tuneRF() function to train different models with different mtry and see which mtry gives lowest OOB (Out-Of-Bag) Error
set.seed(123)
# NOTE(review): train1_full[-c(1)] drops column 1 from the predictor set —
# presumably the revenue column; confirm the column order of train1_full.
tune.rf <- tuneRF(x = train1_full[-c(1)], y = train1_full$revenue, ntreeTry = 350)
## mtry = 2 OOB error = 9.46e+15
## Searching left ...
## mtry = 1 OOB error = 1.06e+16
## -0.118 0.05
## Searching right ...
## mtry = 4 OOB error = 9.33e+15
## 0.014 0.05
best_mtry <- tune.rf[,"mtry"][which.min(tune.rf[,"OOBError"])] # retrieve the mtry with lowest OOB error
#best_mtry # show the best mtry value
mtry giving lowest OOB Error is 4. Now, let’s build a RF model with mtry = 4.
set.seed(123) # set.seed to make sure the selection of tree samples are the same if we re-run the code
# Refit the forest with the tuned mtry = 4 found by tuneRF above.
rev_rf.tune = randomForest(revenue~. , train1_full, mtry = 4, ntree = 350)
rev_rf.tune
##
## Call:
## randomForest(formula = revenue ~ ., data = train1_full, mtry = 4, ntree = 350)
## Type of random forest: regression
## Number of trees: 350
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 9.17e+15
## % Var explained: 74
An improvement in pseudo R-squared. %Var explained increases by 1%.
Testing
# predicting the tuned model on the testing data
rev_rf.test1 <- predict(rev_rf.tune, test1_full)
actual_pred_rf.test1 <- as.data.frame(cbind(actuals = test1_full$revenue, predicteds = rev_rf.test1))
metrics_test_rf1 <- regr.eval(actual_pred_rf.test1$actuals, actual_pred_rf.test1$predicteds)
options(scientific=T, digits = 2) # change format for easier comparison with metrics from previous model
metrics_test_rf1[c(1,3)]
## mae rmse
## 5.5e+07 9.7e+07
Training
#rev_rf.tune$predicted
# predict() with no newdata returns OOB predictions (same as rev_rf.tune$predicted).
rev_rf.train1 <- predict(rev_rf.tune) # same results as rev_rf.tune$predicted, both codes return the predicted values in the training set
actual_pred_rf.train1 <- as.data.frame(cbind(actuals = train1_full$revenue, predicteds = rev_rf.train1))
metrics_train_rf1 <- regr.eval(actual_pred_rf.train1$actuals, actual_pred_rf.train1$predicteds)
metrics_train_rf1[c(1,3)]
## mae rmse
## 5.2e+07 9.6e+07
options(scientific=T, digits = 3) # change back to our initial format
We can see the decreases in MAE and RMSE after tuning the forest.
| Model | Linear Model (budget + vote + popularity + company + genres) | Regression Tree | Random Forest |
|---|---|---|---|
| R-squared(adjusted/ pseudo) | 0.721 | 0.711 | 0.741 |
| MAE - train | 5.8e+07 | 6e+07 | 5.2e+07 |
| MAE - test | 6.2e+07 | 6.6e+07 | 5.5e+07 |
| RMSE - train | 9.9e+07 | 1e+08 | 9.6e+07 |
| RMSE - test | 1e+08 | 1.1e+08 | 9.7e+07 |
Random Forest has the best performance among the three models. There is no sign of overfitting either.
# Inspect the structure of the profit-analysis data frame (3225 obs., 15 variables).
str(movies)
## 'data.frame': 3225 obs. of 15 variables:
## $ X : int 0 1 2 3 4 5 6 7 8 9 ...
## $ budget : int 237000000 300000000 245000000 250000000 260000000 258000000 260000000 280000000 250000000 250000000 ...
## $ genres : Factor w/ 18 levels "Action","Adventure",..: 1 2 1 1 1 9 3 1 2 1 ...
## $ popularity: num 150.4 139.1 107.4 112.3 43.9 ...
## $ company : Factor w/ 6 levels "Others","Paramount Pictures",..: 1 5 3 1 5 3 5 5 6 6 ...
## $ date : Date, format: "2009-12-10" "2007-05-19" ...
## $ revenue : num 2.79e+09 9.61e+08 8.81e+08 1.08e+09 2.84e+08 ...
## $ runtime : num 162 169 148 165 132 139 100 141 153 151 ...
## $ title : Factor w/ 3224 levels "(500) Days of Summer",..: 259 1761 2129 2420 1265 2139 2256 260 1053 310 ...
## $ score : num 7.2 6.9 6.3 7.6 6.1 5.9 7.4 7.3 7.4 5.7 ...
## $ vote : int 11800 4500 4466 9106 2124 3576 3330 6767 5293 7004 ...
## $ genrecount: int 4 3 3 4 3 3 2 3 3 3 ...
## $ profit : num 2.55e+09 6.61e+08 6.36e+08 8.35e+08 2.41e+07 ...
## $ profitable: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ month : num 12 5 10 7 3 5 11 4 7 3 ...
# Correlation of the numeric variables with profit: drop identifiers, factors,
# and leakage columns (revenue/profitable are derived from profit).
movies_reg <- subset(movies, select = - c(X,title,date,genres,genrecount,company,revenue,profitable,month))
# Fix: the original assigned the matrix to `c`, shadowing base::c().
cor_mat <- cor(movies_reg)
corrplot(cor_mat, method = "square")
We find that budget, popularity and vote have relatively strong correlations with profit, while runtime and score have moderate correlations, and the correlation between profit and month is very weak.
# Build month dummy variables and inspect their correlation with profit.
M <- factor(movies$month)
dummm <- as.data.frame(model.matrix(~M)[,-1]) # drop intercept: dummies M2..M12, January is baseline
# NOTE(review): dummies come from `movies` but are bound to `movie` — this only
# works if the two data frames have identical rows; confirm.
movie_M <- cbind(movie, dummm)
# Fix: the original selection listed M6 twice and omitted M5 entirely.
movies_c <- subset(movie_M, select = c(profit,M2,M3,M4,M5,M6,M7,M8,M9,M10,M11,M12))
cc <- cor(movies_c)
corrplot(cc, method = "square")
#str(movie_M)
library(data.table) # NOTE(review): data.table does not appear to be used in this chunk — confirm
# apply one-hot encoding on the company factor (drop intercept: "Others" is baseline)
FactoredVariable = factor(movie_M$company)
dumm = as.data.frame(model.matrix(~FactoredVariable)[,-1])
movie_OH = cbind(movie_M, dumm)
# Rename the generated dummy columns to short studio names
names(movie_OH)[names(movie_OH) == 'FactoredVariableParamount Pictures'] <- 'Paramount'
names(movie_OH)[names(movie_OH) == 'FactoredVariableSony Pictures'] <- 'Sony'
names(movie_OH)[names(movie_OH) == 'FactoredVariableUniversal Pictures'] <- 'Universal'
names(movie_OH)[names(movie_OH) == 'FactoredVariableWalt Disney'] <- 'Disney'
names(movie_OH)[names(movie_OH) == 'FactoredVariableWarner Bros'] <- 'Warner Bro'
# correlation of profit with the studio dummies
movies_cor <- subset(movie_OH, select = c(profit,Paramount,Sony,Universal,Disney,`Warner Bro`))
c <- cor(movies_cor) # NOTE(review): `c` shadows base::c() — works, but rename if revisited
print(c)
## profit Paramount Sony Universal Disney Warner Bro
## profit 1.00000 0.0437 0.00881 0.0952 0.118 -0.00723
## Paramount 0.04371 1.0000 -0.08982 -0.1003 -0.125 -0.07967
## Sony 0.00881 -0.0898 1.00000 -0.1049 -0.131 -0.08334
## Universal 0.09516 -0.1003 -0.10488 1.0000 -0.146 -0.09303
## Disney 0.11825 -0.1251 -0.13084 -0.1460 1.000 -0.11605
## Warner Bro -0.00723 -0.0797 -0.08334 -0.0930 -0.116 1.00000
# One-hot encode genres the same way (drop intercept; first level is baseline)
# and append the dummies to movie_OH.
G = factor(movie_OH$genres)
dumm = as.data.frame(model.matrix(~G)[,-1])
movie_OH = cbind(movie_OH, dumm)
#str(movie_OH)
##
## Call:
## lm(formula = profit ~ ., data = movie_r)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.837 -0.253 -0.014 0.189 10.196
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.07441 0.11317 -0.66 0.51091
## budget 0.16418 0.01578 10.40 < 2e-16 ***
## popularity 0.05699 0.01697 3.36 0.00080 ***
## runtime 0.03582 0.01383 2.59 0.00963 **
## vote 0.58674 0.01960 29.93 < 2e-16 ***
## genresAdventure 0.18645 0.04616 4.04 5.5e-05 ***
## genresAnimation 0.50587 0.07206 7.02 2.7e-12 ***
## genresComedy 0.17053 0.03760 4.53 6.0e-06 ***
## genresCrime -0.01589 0.06085 -0.26 0.79394
## genresDocumentary 0.32226 0.12060 2.67 0.00758 **
## genresDrama 0.04765 0.03790 1.26 0.20877
## genresFamily 0.51303 0.10739 4.78 1.9e-06 ***
## genresFantasy 0.05216 0.07115 0.73 0.46350
## genresHistory 0.08686 0.15304 0.57 0.57038
## genresHorror 0.16382 0.05392 3.04 0.00240 **
## genresMusic 0.16828 0.14481 1.16 0.24529
## genresMystery 0.06660 0.12490 0.53 0.59390
## genresRomance 0.17402 0.08071 2.16 0.03115 *
## genresScience Fiction 0.00999 0.07638 0.13 0.89598
## genresThriller -0.02116 0.06430 -0.33 0.74216
## genresWar -0.21265 0.15214 -1.40 0.16231
## genresWestern -0.05912 0.13885 -0.43 0.67031
## month2 -0.01471 0.06234 -0.24 0.81349
## month3 -0.01656 0.06177 -0.27 0.78859
## month4 0.06175 0.06284 0.98 0.32582
## month5 0.18487 0.06204 2.98 0.00291 **
## month6 0.20766 0.05989 3.47 0.00053 ***
## month7 0.01491 0.06081 0.25 0.80628
## month8 0.02120 0.05933 0.36 0.72085
## month9 0.00411 0.05627 0.07 0.94174
## month10 -0.05029 0.05925 -0.85 0.39612
## month11 0.13956 0.06157 2.27 0.02348 *
## month12 0.14675 0.05874 2.50 0.01252 *
## score -0.02059 0.01684 -1.22 0.22132
## companyParamount Pictures 0.10903 0.04318 2.53 0.01162 *
## companySony Pictures 0.03610 0.04167 0.87 0.38643
## companyUniversal Pictures 0.15210 0.03843 3.96 7.7e-05 ***
## companyWalt Disney 0.10309 0.03342 3.08 0.00205 **
## companyWarner Bros 0.00720 0.04564 0.16 0.87470
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.632 on 3186 degrees of freedom
## Multiple R-squared: 0.605, Adjusted R-squared: 0.601
## F-statistic: 129 on 38 and 3186 DF, p-value: <2e-16
loadPkg("leaps") # already loaded at the top of the file; harmless no-op here
loadPkg("ISLR")
# Backward-elimination best-subset search for profit, up to 14 predictors.
reg.best <- regsubsets(profit~. , data = movie_r, nvmax = 14, nbest = 1, method = "backward")
plot(reg.best, scale = "adjr2", main = "Adjusted R^2")
plot(reg.best, scale = "r2", main = "R^2")
# In the "leaps" package, we can use scale=c("bic","Cp","adjr2","r2")
plot(reg.best, scale = "bic", main = "BIC")
plot(reg.best, scale = "Cp", main = "Cp")
summary(reg.best)
## Subset selection object
## Call: regsubsets.formula(profit ~ ., data = movie_r, nvmax = 14, nbest = 1,
## method = "backward")
## 38 Variables (and intercept)
## Forced in Forced out
## budget FALSE FALSE
## popularity FALSE FALSE
## runtime FALSE FALSE
## vote FALSE FALSE
## genresAdventure FALSE FALSE
## genresAnimation FALSE FALSE
## genresComedy FALSE FALSE
## genresCrime FALSE FALSE
## genresDocumentary FALSE FALSE
## genresDrama FALSE FALSE
## genresFamily FALSE FALSE
## genresFantasy FALSE FALSE
## genresHistory FALSE FALSE
## genresHorror FALSE FALSE
## genresMusic FALSE FALSE
## genresMystery FALSE FALSE
## genresRomance FALSE FALSE
## genresScience Fiction FALSE FALSE
## genresThriller FALSE FALSE
## genresWar FALSE FALSE
## genresWestern FALSE FALSE
## month2 FALSE FALSE
## month3 FALSE FALSE
## month4 FALSE FALSE
## month5 FALSE FALSE
## month6 FALSE FALSE
## month7 FALSE FALSE
## month8 FALSE FALSE
## month9 FALSE FALSE
## month10 FALSE FALSE
## month11 FALSE FALSE
## month12 FALSE FALSE
## score FALSE FALSE
## companyParamount Pictures FALSE FALSE
## companySony Pictures FALSE FALSE
## companyUniversal Pictures FALSE FALSE
## companyWalt Disney FALSE FALSE
## companyWarner Bros FALSE FALSE
## 1 subsets of each size up to 14
## Selection Algorithm: backward
## budget popularity runtime vote genresAdventure genresAnimation
## 1 ( 1 ) " " " " " " "*" " " " "
## 2 ( 1 ) "*" " " " " "*" " " " "
## 3 ( 1 ) "*" " " " " "*" " " "*"
## 4 ( 1 ) "*" " " " " "*" " " "*"
## 5 ( 1 ) "*" " " " " "*" " " "*"
## 6 ( 1 ) "*" " " " " "*" " " "*"
## 7 ( 1 ) "*" " " " " "*" " " "*"
## 8 ( 1 ) "*" " " " " "*" " " "*"
## 9 ( 1 ) "*" " " " " "*" "*" "*"
## 10 ( 1 ) "*" " " " " "*" "*" "*"
## 11 ( 1 ) "*" " " " " "*" "*" "*"
## 12 ( 1 ) "*" "*" " " "*" "*" "*"
## 13 ( 1 ) "*" "*" " " "*" "*" "*"
## 14 ( 1 ) "*" "*" " " "*" "*" "*"
## genresComedy genresCrime genresDocumentary genresDrama
## 1 ( 1 ) " " " " " " " "
## 2 ( 1 ) " " " " " " " "
## 3 ( 1 ) " " " " " " " "
## 4 ( 1 ) " " " " " " " "
## 5 ( 1 ) " " " " " " " "
## 6 ( 1 ) " " " " " " " "
## 7 ( 1 ) " " " " " " " "
## 8 ( 1 ) "*" " " " " " "
## 9 ( 1 ) "*" " " " " " "
## 10 ( 1 ) "*" " " " " " "
## 11 ( 1 ) "*" " " " " " "
## 12 ( 1 ) "*" " " " " " "
## 13 ( 1 ) "*" " " " " " "
## 14 ( 1 ) "*" " " " " " "
## genresFamily genresFantasy genresHistory genresHorror
## 1 ( 1 ) " " " " " " " "
## 2 ( 1 ) " " " " " " " "
## 3 ( 1 ) " " " " " " " "
## 4 ( 1 ) " " " " " " " "
## 5 ( 1 ) "*" " " " " " "
## 6 ( 1 ) "*" " " " " " "
## 7 ( 1 ) "*" " " " " " "
## 8 ( 1 ) "*" " " " " " "
## 9 ( 1 ) "*" " " " " " "
## 10 ( 1 ) "*" " " " " " "
## 11 ( 1 ) "*" " " " " " "
## 12 ( 1 ) "*" " " " " " "
## 13 ( 1 ) "*" " " " " " "
## 14 ( 1 ) "*" " " " " "*"
## genresMusic genresMystery genresRomance genresScience Fiction
## 1 ( 1 ) " " " " " " " "
## 2 ( 1 ) " " " " " " " "
## 3 ( 1 ) " " " " " " " "
## 4 ( 1 ) " " " " " " " "
## 5 ( 1 ) " " " " " " " "
## 6 ( 1 ) " " " " " " " "
## 7 ( 1 ) " " " " " " " "
## 8 ( 1 ) " " " " " " " "
## 9 ( 1 ) " " " " " " " "
## 10 ( 1 ) " " " " " " " "
## 11 ( 1 ) " " " " " " " "
## 12 ( 1 ) " " " " " " " "
## 13 ( 1 ) " " " " " " " "
## 14 ( 1 ) " " " " " " " "
## genresThriller genresWar genresWestern month2 month3 month4
## 1 ( 1 ) " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " " " " "
## 5 ( 1 ) " " " " " " " " " " " "
## 6 ( 1 ) " " " " " " " " " " " "
## 7 ( 1 ) " " " " " " " " " " " "
## 8 ( 1 ) " " " " " " " " " " " "
## 9 ( 1 ) " " " " " " " " " " " "
## 10 ( 1 ) " " " " " " " " " " " "
## 11 ( 1 ) " " " " " " " " " " " "
## 12 ( 1 ) " " " " " " " " " " " "
## 13 ( 1 ) " " " " " " " " " " " "
## 14 ( 1 ) " " " " " " " " " " " "
## month5 month6 month7 month8 month9 month10 month11 month12 score
## 1 ( 1 ) " " " " " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " " " " " "
## 5 ( 1 ) " " "*" " " " " " " " " " " " " " "
## 6 ( 1 ) " " "*" " " " " " " " " " " "*" " "
## 7 ( 1 ) "*" "*" " " " " " " " " " " "*" " "
## 8 ( 1 ) "*" "*" " " " " " " " " " " "*" " "
## 9 ( 1 ) "*" "*" " " " " " " " " " " "*" " "
## 10 ( 1 ) "*" "*" " " " " " " " " "*" "*" " "
## 11 ( 1 ) "*" "*" " " " " " " " " "*" "*" " "
## 12 ( 1 ) "*" "*" " " " " " " " " "*" "*" " "
## 13 ( 1 ) "*" "*" " " " " " " " " "*" "*" " "
## 14 ( 1 ) "*" "*" " " " " " " " " "*" "*" " "
## companyParamount Pictures companySony Pictures
## 1 ( 1 ) " " " "
## 2 ( 1 ) " " " "
## 3 ( 1 ) " " " "
## 4 ( 1 ) " " " "
## 5 ( 1 ) " " " "
## 6 ( 1 ) " " " "
## 7 ( 1 ) " " " "
## 8 ( 1 ) " " " "
## 9 ( 1 ) " " " "
## 10 ( 1 ) " " " "
## 11 ( 1 ) " " " "
## 12 ( 1 ) " " " "
## 13 ( 1 ) " " " "
## 14 ( 1 ) " " " "
## companyUniversal Pictures companyWalt Disney companyWarner Bros
## 1 ( 1 ) " " " " " "
## 2 ( 1 ) " " " " " "
## 3 ( 1 ) " " " " " "
## 4 ( 1 ) " " " " " "
## 5 ( 1 ) " " " " " "
## 6 ( 1 ) " " " " " "
## 7 ( 1 ) " " " " " "
## 8 ( 1 ) " " " " " "
## 9 ( 1 ) " " " " " "
## 10 ( 1 ) " " " " " "
## 11 ( 1 ) "*" " " " "
## 12 ( 1 ) "*" " " " "
## 13 ( 1 ) "*" "*" " "
## 14 ( 1 ) "*" "*" " "
# Re-run the subset search with month coerced to a factor.
# NOTE(review): the printed output below is identical to the previous run
# (same 38 variables incl. month2..month12), suggesting month may already have
# been dummy-encoded in movie_r — confirm whether this conversion changes anything.
movie_r$month <- as.factor(movie_r$month)
reg.best <- regsubsets(profit~. , data = movie_r, nvmax = 14, nbest = 1, method = "backward")
plot(reg.best, scale = "adjr2", main = "Adjusted R^2")
plot(reg.best, scale = "r2", main = "R^2")
# In the "leaps" package, we can use scale=c("bic","Cp","adjr2","r2")
plot(reg.best, scale = "bic", main = "BIC")
plot(reg.best, scale = "Cp", main = "Cp")
summary(reg.best)
## Subset selection object
## Call: regsubsets.formula(profit ~ ., data = movie_r, nvmax = 14, nbest = 1,
## method = "backward")
## 38 Variables (and intercept)
## Forced in Forced out
## budget FALSE FALSE
## popularity FALSE FALSE
## runtime FALSE FALSE
## vote FALSE FALSE
## genresAdventure FALSE FALSE
## genresAnimation FALSE FALSE
## genresComedy FALSE FALSE
## genresCrime FALSE FALSE
## genresDocumentary FALSE FALSE
## genresDrama FALSE FALSE
## genresFamily FALSE FALSE
## genresFantasy FALSE FALSE
## genresHistory FALSE FALSE
## genresHorror FALSE FALSE
## genresMusic FALSE FALSE
## genresMystery FALSE FALSE
## genresRomance FALSE FALSE
## genresScience Fiction FALSE FALSE
## genresThriller FALSE FALSE
## genresWar FALSE FALSE
## genresWestern FALSE FALSE
## month2 FALSE FALSE
## month3 FALSE FALSE
## month4 FALSE FALSE
## month5 FALSE FALSE
## month6 FALSE FALSE
## month7 FALSE FALSE
## month8 FALSE FALSE
## month9 FALSE FALSE
## month10 FALSE FALSE
## month11 FALSE FALSE
## month12 FALSE FALSE
## score FALSE FALSE
## companyParamount Pictures FALSE FALSE
## companySony Pictures FALSE FALSE
## companyUniversal Pictures FALSE FALSE
## companyWalt Disney FALSE FALSE
## companyWarner Bros FALSE FALSE
## 1 subsets of each size up to 14
## Selection Algorithm: backward
## budget popularity runtime vote genresAdventure genresAnimation
## 1 ( 1 ) " " " " " " "*" " " " "
## 2 ( 1 ) "*" " " " " "*" " " " "
## 3 ( 1 ) "*" " " " " "*" " " "*"
## 4 ( 1 ) "*" " " " " "*" " " "*"
## 5 ( 1 ) "*" " " " " "*" " " "*"
## 6 ( 1 ) "*" " " " " "*" " " "*"
## 7 ( 1 ) "*" " " " " "*" " " "*"
## 8 ( 1 ) "*" " " " " "*" " " "*"
## 9 ( 1 ) "*" " " " " "*" "*" "*"
## 10 ( 1 ) "*" " " " " "*" "*" "*"
## 11 ( 1 ) "*" " " " " "*" "*" "*"
## 12 ( 1 ) "*" "*" " " "*" "*" "*"
## 13 ( 1 ) "*" "*" " " "*" "*" "*"
## 14 ( 1 ) "*" "*" " " "*" "*" "*"
## genresComedy genresCrime genresDocumentary genresDrama
## 1 ( 1 ) " " " " " " " "
## 2 ( 1 ) " " " " " " " "
## 3 ( 1 ) " " " " " " " "
## 4 ( 1 ) " " " " " " " "
## 5 ( 1 ) " " " " " " " "
## 6 ( 1 ) " " " " " " " "
## 7 ( 1 ) " " " " " " " "
## 8 ( 1 ) "*" " " " " " "
## 9 ( 1 ) "*" " " " " " "
## 10 ( 1 ) "*" " " " " " "
## 11 ( 1 ) "*" " " " " " "
## 12 ( 1 ) "*" " " " " " "
## 13 ( 1 ) "*" " " " " " "
## 14 ( 1 ) "*" " " " " " "
## genresFamily genresFantasy genresHistory genresHorror
## 1 ( 1 ) " " " " " " " "
## 2 ( 1 ) " " " " " " " "
## 3 ( 1 ) " " " " " " " "
## 4 ( 1 ) " " " " " " " "
## 5 ( 1 ) "*" " " " " " "
## 6 ( 1 ) "*" " " " " " "
## 7 ( 1 ) "*" " " " " " "
## 8 ( 1 ) "*" " " " " " "
## 9 ( 1 ) "*" " " " " " "
## 10 ( 1 ) "*" " " " " " "
## 11 ( 1 ) "*" " " " " " "
## 12 ( 1 ) "*" " " " " " "
## 13 ( 1 ) "*" " " " " " "
## 14 ( 1 ) "*" " " " " "*"
## genresMusic genresMystery genresRomance genresScience Fiction
## 1 ( 1 ) " " " " " " " "
## 2 ( 1 ) " " " " " " " "
## 3 ( 1 ) " " " " " " " "
## 4 ( 1 ) " " " " " " " "
## 5 ( 1 ) " " " " " " " "
## 6 ( 1 ) " " " " " " " "
## 7 ( 1 ) " " " " " " " "
## 8 ( 1 ) " " " " " " " "
## 9 ( 1 ) " " " " " " " "
## 10 ( 1 ) " " " " " " " "
## 11 ( 1 ) " " " " " " " "
## 12 ( 1 ) " " " " " " " "
## 13 ( 1 ) " " " " " " " "
## 14 ( 1 ) " " " " " " " "
## genresThriller genresWar genresWestern month2 month3 month4
## 1 ( 1 ) " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " " " " "
## 5 ( 1 ) " " " " " " " " " " " "
## 6 ( 1 ) " " " " " " " " " " " "
## 7 ( 1 ) " " " " " " " " " " " "
## 8 ( 1 ) " " " " " " " " " " " "
## 9 ( 1 ) " " " " " " " " " " " "
## 10 ( 1 ) " " " " " " " " " " " "
## 11 ( 1 ) " " " " " " " " " " " "
## 12 ( 1 ) " " " " " " " " " " " "
## 13 ( 1 ) " " " " " " " " " " " "
## 14 ( 1 ) " " " " " " " " " " " "
## month5 month6 month7 month8 month9 month10 month11 month12 score
## 1 ( 1 ) " " " " " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " " " " " "
## 5 ( 1 ) " " "*" " " " " " " " " " " " " " "
## 6 ( 1 ) " " "*" " " " " " " " " " " "*" " "
## 7 ( 1 ) "*" "*" " " " " " " " " " " "*" " "
## 8 ( 1 ) "*" "*" " " " " " " " " " " "*" " "
## 9 ( 1 ) "*" "*" " " " " " " " " " " "*" " "
## 10 ( 1 ) "*" "*" " " " " " " " " "*" "*" " "
## 11 ( 1 ) "*" "*" " " " " " " " " "*" "*" " "
## 12 ( 1 ) "*" "*" " " " " " " " " "*" "*" " "
## 13 ( 1 ) "*" "*" " " " " " " " " "*" "*" " "
## 14 ( 1 ) "*" "*" " " " " " " " " "*" "*" " "
## companyParamount Pictures companySony Pictures
## 1 ( 1 ) " " " "
## 2 ( 1 ) " " " "
## 3 ( 1 ) " " " "
## 4 ( 1 ) " " " "
## 5 ( 1 ) " " " "
## 6 ( 1 ) " " " "
## 7 ( 1 ) " " " "
## 8 ( 1 ) " " " "
## 9 ( 1 ) " " " "
## 10 ( 1 ) " " " "
## 11 ( 1 ) " " " "
## 12 ( 1 ) " " " "
## 13 ( 1 ) " " " "
## 14 ( 1 ) " " " "
## companyUniversal Pictures companyWalt Disney companyWarner Bros
## 1 ( 1 ) " " " " " "
## 2 ( 1 ) " " " " " "
## 3 ( 1 ) " " " " " "
## 4 ( 1 ) " " " " " "
## 5 ( 1 ) " " " " " "
## 6 ( 1 ) " " " " " "
## 7 ( 1 ) " " " " " "
## 8 ( 1 ) " " " " " "
## 9 ( 1 ) " " " " " "
## 10 ( 1 ) " " " " " "
## 11 ( 1 ) "*" " " " "
## 12 ( 1 ) "*" " " " "
## 13 ( 1 ) "*" "*" " "
## 14 ( 1 ) "*" "*" " "
## Abbreviation
## budget b
## popularity p
## runtime r
## vote v
## genresAdventure gnrsAd
## genresAnimation gnrsAn
## genresComedy gnrsCm
## genresCrime gnrsCr
## genresDocumentary gnrsDc
## genresDrama gnrsDr
## genresFamily gnrsFm
## genresFantasy gnrsFn
## genresHistory gnrsHs
## genresHorror gnrsHr
## genresMusic gnrsMs
## genresMystery gnrsMy
## genresRomance gR
## genresScience Fiction gSF
## genresThriller gT
## genresWar gnrsWr
## genresWestern gnrsWs
## month2 m2
## month3 m3
## month4 m4
## month5 m5
## month6 m6
## month7 m7
## month8 m8
## month9 m9
## month10 m10
## month11 m11
## month12 m12
## score s
## companyParamount Pictures cPP
## companySony Pictures cSP
## companyUniversal Pictures cUP
## companyWalt Disney cD
## companyWarner Bros cB
According to the plots above, we know that predictors (budget, vote, month5, month6,month12, genre Animation, and genre family) form the best model.
##
## Call:
## lm(formula = profit ~ budget + vote + M5 + M6 + M12 + GAnimation +
## GFamily, data = movie_OH)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.016 -0.256 -0.016 0.161 9.939
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.0655 0.0135 -4.85 1.3e-06 ***
## budget 0.1853 0.0137 13.55 < 2e-16 ***
## vote 0.6236 0.0134 46.53 < 2e-16 ***
## M5 0.1705 0.0433 3.93 8.5e-05 ***
## M6 0.2104 0.0402 5.23 1.8e-07 ***
## M12 0.1500 0.0374 4.01 6.3e-05 ***
## GAnimation 0.4053 0.0665 6.10 1.2e-09 ***
## GFamily 0.4635 0.1047 4.43 9.9e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.64 on 3217 degrees of freedom
## Multiple R-squared: 0.592, Adjusted R-squared: 0.591
## F-statistic: 665 on 7 and 3217 DF, p-value: <2e-16
## [1] 38 201
We try to find an optimal value for lambda. It turns out that the best lambda value is 0.032.
With lambda equal to 0.032, we have the r square to be 0.604, which is slightly better than the linear model we have. Since the best lambda value is close to 0, the original linear regression model is not overfit.
## [1] 1.26
## (Intercept) companyParamount Pictures
## -0.03255 0.05064
## companySony Pictures companyUniversal Pictures
## 0.01068 0.08523
## companyWalt Disney companyWarner Bros
## 0.08018 0.00256
## month2 month3
## -0.05031 -0.02952
## month4 month5
## 0.00378 0.09768
## month6 month7
## 0.11181 0.01346
## month8 month9
## -0.04168 -0.06057
## month10 month11
## -0.05497 0.05325
## month12 genresAdventure
## 0.05378 0.11871
## genresAnimation genresComedy
## 0.23731 -0.00308
## genresCrime genresDocumentary
## -0.07895 -0.02050
## genresDrama genresFamily
## -0.06092 0.17305
## genresFantasy genresHistory
## 0.01646 -0.01602
## genresHorror genresMusic
## -0.00571 -0.04809
## genresMystery genresRomance
## -0.02937 0.00378
## genresScience Fiction genresThriller
## 0.04979 -0.04919
## genresWar genresWestern
## -0.14120 -0.11157
## popularity vote
## 0.14075 0.22603
## score budget
## 0.04979 0.14449
## [1] 0.55
## [1] 1.26
## (Intercept) companyParamount Pictures
## -0.03255 0.05064
## companySony Pictures companyUniversal Pictures
## 0.01068 0.08523
## companyWalt Disney companyWarner Bros
## 0.08018 0.00256
## month2 month3
## -0.05031 -0.02952
## month4 month5
## 0.00378 0.09768
## month6 month7
## 0.11181 0.01346
## month8 month9
## -0.04168 -0.06057
## month10 month11
## -0.05497 0.05325
## month12 genresAdventure
## 0.05378 0.11871
## genresAnimation genresComedy
## 0.23731 -0.00308
## genresCrime genresDocumentary
## -0.07895 -0.02050
## genresDrama genresFamily
## -0.06092 0.17305
## genresFantasy genresHistory
## 0.01646 -0.01602
## genresHorror genresMusic
## -0.00571 -0.04809
## genresMystery genresRomance
## -0.02937 0.00378
## genresScience Fiction genresThriller
## 0.04979 -0.04919
## genresWar genresWestern
## -0.14120 -0.11157
## popularity vote
## 0.14075 0.22603
## score budget
## 0.04979 0.14449
## [1] 1.14
## [1] 0.603
## companyWarner Bros month7 month8
## 0 0 0
## genresFantasy genresMystery genresScience Fiction
## 0 0 0
## score
## 0
Here, we see that the lowest MSE occurs at the \(\lambda\) value reported above; the corresponding fit has 31 non-zero coefficients (the 7 coefficients listed above are shrunk exactly to zero).
#str(movies_rf)
# install.packages('caTools')
library(caTools)
movie_RF <- subset(movies, select = c(company, genres, month, runtime, budget, popularity, score, vote))
# keep the response with the predictors so the train/test rows stay aligned
# NOTE(review): predictors come from `movies` but profit from `movie` -- confirm both refer to the same rows
movie_RF$profit <- movie$profit
set.seed(123)
# BUG FIX: sample.split() expects the outcome VECTOR; the original passed the
# whole data frame, producing a length-ncol(movie_RF) mask that was silently
# recycled over the rows.
split = sample.split(movie_RF$profit, SplitRatio = 0.8)
training_set = subset(movie_RF, split == TRUE)
test_set = subset(movie_RF, split == FALSE)
# BUG FIX: fit on the training rows only -- the original trained on the full
# data set, so the held-out test_set leaked into the model. (The recorded
# output below reflects the original full-data fit.)
RFR = randomForest(x = training_set[, setdiff(names(training_set), "profit")],
                   y = training_set$profit,
                   ntree = 500)
print(RFR)
##
## Call:
## randomForest(x = movie_RF, y = movie$profit, ntree = 500)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 2
##
## Mean of squared residuals: 8.92e+15
## % Var explained: 64.4
The random forest regression model has the highest r-squared value (about 0.64, i.e. 64.4% of variance explained) with 500 trees.
plot(RFR)
It turns out that 500 trees are enough to build the model.
#summary(regressor)
# Predicting a new result with Random Forest Regression
Mulan <- data.frame(company = 'Walt Disney',genres = 'Animation',month = 3, runtime = 100 ,budget = 25000000, popularity = 165.13, score = 9.4,vote = 4566)
#levels(Mulan$company) <- regressor$forest$xlevels$company
#levels(Mulan$genre) <- regressor$forest$xlevels$genre
#Mulan_profit = predict(regressor, Mulan)
# Set the popularity the same as the movie Frozen's
Frozen2a <- data.frame(company = 'Walt Disney',genres = 'Animation',month = 11, runtime = 104 ,budget = 33000000, popularity = 165.13, score = 6.79,vote = 5295)
# BUG FIX: `levels(x) <- xlevels` RELABELS the existing levels by position, so
# the one-level factor "Walt Disney" was silently renamed to the first training
# level ("Others" for company, "Action" for genres) and the prediction was made
# for the wrong movie profile. factor(x, levels = ...) maps the value onto the
# training levels instead.
Frozen2a$company <- factor(Frozen2a$company, levels = RFR$forest$xlevels$company)
Frozen2a$genres <- factor(Frozen2a$genres, levels = RFR$forest$xlevels$genres)
Frozen2_profita = predict(RFR, Frozen2a)
Frozen <- data.frame(company = 'Walt Disney',genres = 'Animation',month = 11, runtime = 104 ,budget = 150000000, popularity = 165.13, score = 7.3,vote = 5295)
Frozen$company <- factor(Frozen$company, levels = RFR$forest$xlevels$company)
Frozen$genres <- factor(Frozen$genres, levels = RFR$forest$xlevels$genres)
Frozen_profit = predict(RFR, Frozen)
We want to examine if Principal Component Analysis method is good at dimensional reduction. In our data, we have 5 continuous variables to predict the revenue. We will perform principal component regression directly and see how many components are sufficient for the model.
In this part, we will also clarify the reason to scale our data in previous chapter by comparing two PCR models of different versions: centered data and non-centered data.
PCAxform <- function(df, z=TRUE) {
#' Obtain the dataframe with the Principal Components after the rotation.
#' ELo 201903 GWU DATS
#' @param df The dataframe. All columns must be numeric (PCA requirement).
#' @param z T/F or 0/1 for z-score to be used
#' @return The transformed dataframe with columns PC1..PCk.
#' @examples
#' tmp = PCAxform(USArrests,TRUE)
  # normalize the z flag to a single logical (accepts TRUE / "true" / "T" / 1 / "1", etc.)
  z <- isTRUE(z) || z %in% c("true", "True", "T", "t", 1, "1")
  if (z) { df <- data.frame(scale(df)) } # scale is not safe for non-numeric columns, but PCA requires all-numeric input anyway
  pr.out <- prcomp(df, scale = z)
  # The original accumulated each PC column element-by-element in a double loop
  # (O(n^2) data-frame growth); the rotation of the whole data is just a matrix
  # product: scores[, i] = sum_j rotation[j, i] * df[, j].
  # NOTE: when z is FALSE the data are deliberately NOT centered before the
  # product, so the result differs from pr.out$x by a constant column shift --
  # this preserves the original function's behavior.
  df1 <- as.data.frame(as.matrix(df) %*% pr.out$rotation)
  colnames(df1) <- paste0("PC", seq_len(ncol(df)))
  rownames(df1) <- NULL # original always returned default 1:n row names
  return(df1)
}
# To-be-implemented: for z=TRUE, it will be better to have the z-scaling option for x-vars and y separately. It is actually convenient keep y in original units.
PCRxform <- function(df, y, zX=TRUE, zy=FALSE) {
#' Obtain the dataframe with the Principal Components after the rotation for PCRegression. Requires related function PCAxform()
#' ELo 201903 GWU DATS
#' @param df The dataframe.
#' @param y The y-variable column index number (numeric), or the name of y-variable
#' @param zX T/F or 0/1 for z-score used on X-variables
#' @param zy T/F or 0/1 for z-score used on the target y-variable
#' @return The transformed dataframe: y first, then PC1..PCk.
#' @examples
  # normalize the zy flag to a single logical
  zy <- isTRUE(zy) || zy %in% c("true", "True", "T", "t", 1, "1")
  if (is.numeric(y)) { # y given as a column index
    # BUG FIX: the original tested is.integer(y), which is FALSE for a plain
    # numeric literal like 1 (a double), so PCRxform(df, 1) silently fell
    # through to the name branch and failed with "Variable name not found".
    y <- as.integer(y)
    if (y > length(df) || y < 1) {
      print("Invalid column number")
      return(NULL)
    }
    df1 <- if (zy) data.frame(scale(df[y])) else df[y] # save y-var in df1
    df <- df[-y] # remove y-variable from the predictors
  } else { # y is not numeric, so interpret as a column name
    df1 <- if (zy) data.frame(scale(df[names(df) == y])) else df[names(df) == y]
    df <- df[names(df) != y] # remove y-variable from the predictors
  }
  if (length(df1) < 1) {
    print("Variable name not found in data.frame")
    return(NULL)
  }
  # normalize the zX flag, then rotate the predictors via PCAxform()
  zX <- isTRUE(zX) || zX %in% c("true", "True", "T", "t", 1, "1")
  df2 <- PCAxform(df, zX)
  return(data.frame(df1, df2)) # piece y and the PCs back together
}
loadPkg("pls")
loadPkg("mice")
#loadPkg("ISLR")
# PCA on the six numeric columns (revenue, budget, popularity, runtime, score,
# vote), once without and once with scaling, to compare the two versions below.
pr = prcomp(movie_num[c(1:6)] , scale =FALSE)
pr_scale = prcomp(movie_num[c(1:6)], scale =TRUE)
We will also include revenue when checking variance.
Non-centered data
summary(pr)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.89e+08 3.10e+07 926 23.9 20.1 0.699
## Proportion of Variance 9.74e-01 2.62e-02 0 0.0 0.0 0.000
## Cumulative Proportion 9.74e-01 1.00e+00 1 1.0 1.0 1.000
#pr$rotation
Centered data
summary(pr_scale)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.768 1.108 0.894 0.6466 0.5045 0.4186
## Proportion of Variance 0.521 0.204 0.133 0.0697 0.0424 0.0292
## Cumulative Proportion 0.521 0.725 0.859 0.9284 0.9708 1.0000
#pr_scale$rotation
We see a significant difference in the variances and standard deviations of the different components. In fact, revenue and our predictors (except budget) are measured on different scales: revenue and budget are measured in millions, which is significantly larger than the scales of vote, score, popularity, etc.
Therefore, we recommend to scale the data before constructing the model. We will see the differences between non-centered and centered data in the following models.
# proportion of variance explained for each version, plotted cumulatively
pr.var <- (pr$sdev^2)
pve <- pr.var/sum(pr.var)
# BUG FIX: the xlab strings were swapped in the original -- the non-centered
# fit was labeled "standardized" and the centered fit "non-standardized".
plot(cumsum(pve), xlab="Principal Component (non-standardized)", ylab ="Cumulative Proportion of Variance Explained",ylim=c(0,1),type="b", main ="Non-centered version")
pr_scale.var <- (pr_scale$sdev^2)
pve_scale <- pr_scale.var/sum(pr_scale.var)
plot(cumsum(pve_scale), xlab="Principal Component (standardized)", ylab ="Cumulative Proportion of Variance Explained",ylim=c(0,1),type="b", main ="Centered version")
In the non-centered version, 1 component explains almost 100% of the variance. In the centered version, 1 component only explains about 50% of the variance; to reach 80%, we need 3 components. This is further evidence that we should perform scaling, since revenue and budget overwhelm the components in the non-centered version, causing PC1 to capture nearly the entire variance.
# we used the train and test sets in linear regression model
# at first we use the PCRxform function (Prof. EL) to scale and rotate
# training set
# we do not scale the response revenue but scale all predictors
# NOTE(review): PCRxform computes a fresh PCA rotation per call, so the test
# sets below are rotated with their OWN rotation, not the training rotation --
# confirm this is intended before comparing train/test scores.
movie_pcr = PCRxform(train1[c(1:6)],"revenue",TRUE,FALSE) # scaled predictors
#movie_pcr
# testing set
movie_pcr_test = PCRxform(test1[c(1:6)],"revenue",TRUE,FALSE)
#movie_pcr_test
# non-centered version, nothing is scaled
movie_pcr.nc = PCRxform(train1[c(1:6)],"revenue",FALSE, FALSE) # non-scaled predictors
#movie_pcr.nc
movie_pcr.nc_test = PCRxform(test1[c(1:6)],"revenue",FALSE, FALSE)
#movie_pcr.nc_test
# scaled data
pcr.fit <- pcr(revenue~.,data=movie_pcr,scale=FALSE,validation="CV") # build the model, as we scaled data before we use scale = F here
# non-scaled data
pcr.fit.nc <- pcr(revenue~.,data=movie_pcr.nc,scale=FALSE,validation="CV")
Non-centered version
validationplot(pcr.fit.nc,val.type="MSEP", legend="topright")
validationplot(pcr.fit.nc,val.type="R2")
Centered version
validationplot(pcr.fit,val.type="MSEP", legend="topright") # validation with MSEP & R2
validationplot(pcr.fit,val.type="R2")
Both versions agree that 2 components give the optimized MSEP and R2.
We can see the coefficients of different components.
coefplot(pcr.fit, intercept=TRUE) # plot coefficients of different components
# position 1 is the intercept, the first PC starts at position 2
We can see that after PC2, the change in coefficients is not significant compared to the difference between the coefficients of PC1 and PC2. Hence, we can use PC1 and PC2 to optimize the model, since two components are sufficient to capture the variance.
We can make a comparison between the non-centered and centered versions.
Non-centered version
# we can see the summary of the model
summary(pcr.fit.nc)
## Data: X dimension: 2176 5
## Y dimension: 2176 1
## Fit method: svdpc
## Number of components considered: 5
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps
## CV 1.88e+08 122151898 108524196 107141642 102852443 102965230
## adjCV 1.88e+08 122077691 108448087 107128888 102792895 102854326
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps
## X 49.00 71.71 87.31 95.63 100.00
## revenue 57.87 66.61 67.94 70.47 71.13
summary(pcr.fit)
## Data: X dimension: 2176 5
## Y dimension: 2176 1
## Fit method: svdpc
## Number of components considered: 5
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps
## CV 1.88e+08 121720344 106321914 1.06e+08 103819563 102861273
## adjCV 1.88e+08 121665663 106238340 1.06e+08 103733078 102758138
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps
## X 48.50 71.69 87.41 95.61 100.00
## revenue 58.21 68.09 68.67 70.49 71.13
Scaled data have better variance explanation for revenue than non-scaled data.
Let’s try the pcr model on testing data. We will use the centered version.
variance.pcr = c(1:5) # preallocate a vector to store the variances for different numbers of components
for (i in 1:5) {
pcr_pred <- predict(pcr.fit, movie_pcr_test , ncomp = i); # predicting the PCR model on the testing set
# calculate the "variance" as the mean of [predicted value - mean(actual values)]^2
# NOTE(review): this is the spread of the predictions around the actual mean,
# not the prediction MSE (which would use the individual actual values) --
# confirm that is the intended metric.
variance.pcr[i] = mean((pcr_pred - mean(movie_pcr_test$revenue))^2)
}
# collect (components, variance) pairs for plotting
var.pcr.df = as.data.frame(cbind(variance=variance.pcr, components = c(1:5)))
#pred_act <- data.frame(cbind(pcr_pred, movie_pcr_test$revenue))
ggplot(data=var.pcr.df) +
geom_line(aes(x = components, y = variance)) +
geom_point(aes(x = components, y = variance)) +
theme_light()
There is a significant increase in the variance from PC1 to PC2, after that the change of variance is not too drastic. We can say that 2 principal components can capture the majority of variance in the testing data. Our PCR model seems to perform properly in the testing data.
We can use principal components as the predictors for a linear model. We will use two components to build the models with two versions: centered and non-centered.
Non-centered version
pcr_lm5 <- lm(revenue ~ PC1 + PC2 , data = movie_pcr.nc) # linear model with PC1 and PC2
summary(pcr_lm5)
##
## Call:
## lm(formula = revenue ~ PC1 + PC2, data = movie_pcr.nc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.20e+09 -4.08e+07 -5.95e+06 2.28e+07 1.76e+09
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 121493614 2330120 52.1 <2e-16 ***
## PC1 -89813816 1463518 -61.4 <2e-16 ***
## PC2 51258272 2149532 23.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.09e+08 on 2173 degrees of freedom
## Multiple R-squared: 0.666, Adjusted R-squared: 0.666
## F-statistic: 2.17e+03 on 2 and 2173 DF, p-value: <2e-16
vif(pcr_lm5)
## PC1 PC2
## 1 1
Centered version
pcr_lm4 <- lm(revenue ~ PC1 + PC2 , data = movie_pcr)
summary(pcr_lm4)
##
## Call:
## lm(formula = revenue ~ PC1 + PC2, data = movie_pcr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.11e+09 -4.03e+07 -4.95e+06 2.42e+07 1.73e+09
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 120012282 2277573 52.7 <2e-16 ***
## PC1 -92108152 1462914 -63.0 <2e-16 ***
## PC2 54901946 2115458 25.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.06e+08 on 2173 degrees of freedom
## Multiple R-squared: 0.681, Adjusted R-squared: 0.681
## F-statistic: 2.32e+03 on 2 and 2173 DF, p-value: <2e-16
vif(pcr_lm4)
## PC1 PC2
## 1 1
All vifs are 1, nice feature from PCA method.
The scaled version gives better results than non-scaled version.
We can also use AIC and BIC for validation between non-scaled and scaled versions.
AIC(object = pcr_lm4)
## [1] 86611
BIC(object = pcr_lm4)
## [1] 86633
AIC(object = pcr_lm5)
## [1] 86710
BIC(object = pcr_lm5)
## [1] 86732
#str(movie)
if (require("pls")) {detach("package:pls", unload = T) # for some reason package "pls" made the corrplot not proper so I detach it after using
}
Both AIC and BIC agree that the scaled version is better.
prd_pcr <- predict(pcr_lm4, newdata=movie_pcr_test)
regr.eval(movie_pcr_test$revenue, prd_pcr)
## mae mse rmse mape
## 1.13e+08 2.41e+16 1.55e+08 7.21e+04
Compared to the linear model (of numerical variables) in the previous chapter, the Adjusted R-squared slightly decreases, from 71.1% to 68.1%. This is expected, since the goal of PCA is to reduce dimensionality. Instead of using 5 variables (budget, popularity, vote, score, runtime), we need only 2 (PC1 and PC2), speeding up the computational process while not significantly hurting the performance of the model.
In this chapter, we use different kinds of model to predict if a movie earns profit or not (box office revenue > budget -> earn profit). In our data, we use column profitable to record whether a movie earns profit or not; a movie with revenue > budget is labeled as 1, otherwise it is labeled as 0.
We use three models in this section: Logistic Regression, Classification Tree and KNN.
Genres
# Prior check the dependence of genres and profitable
contable1 <- table(movie$genres, movie$profitable)
chisqres1 = chisq.test(contable1)
chisqres1
##
## Pearson's Chi-squared test
##
## data: contable1
## X-squared = 54, df = 17, p-value = 1e-05
Company
# Prior check the dependence of company and profitable
contable2 <- table(movie$company, movie$profitable)
chisqres2 = chisq.test(contable2)
chisqres2
##
## Pearson's Chi-squared test
##
## data: contable2
## X-squared = 93, df = 5, p-value <2e-16
Season
# Prior check the dependence of season and profitable
contable3 <- table(movie$season, movie$profitable)
chisqres3 = chisq.test(contable3)
chisqres3
##
## Pearson's Chi-squared test
##
## data: contable3
## X-squared = 20, df = 3, p-value = 1e-04
In this part we will construct the logit model on the whole dataset.
# we do some preparation with the data for our logit model
movie_nd <- movie_num # duplicate the dataframe of numerical variables
# append back the categorical columns so the logit model can use them
movie_nd$genres = movie$genres
movie_nd$company = movie$company
movie_nd$season = movie$season
movie_nd$y = movie$profitable # profitable column (negative profit = 0, positive profit = 1) is saved as y
str(movie_nd) # sanity-check the assembled frame (6 numeric + 3 factor predictors + y)
## 'data.frame': 3225 obs. of 10 variables:
## $ revenue : num 2.79e+09 9.61e+08 8.81e+08 1.08e+09 2.84e+08 ...
## $ budget : int 237000000 300000000 245000000 250000000 260000000 258000000 260000000 280000000 250000000 250000000 ...
## $ popularity: num 150.4 139.1 107.4 112.3 43.9 ...
## $ runtime : num 162 169 148 165 132 139 100 141 153 151 ...
## $ score : num 7.2 6.9 6.3 7.6 6.1 5.9 7.4 7.3 7.4 5.7 ...
## $ vote : int 11800 4500 4466 9106 2124 3576 3330 6767 5293 7004 ...
## $ genres : Factor w/ 18 levels "Action","Adventure",..: 1 2 1 1 1 9 3 1 2 1 ...
## $ company : Factor w/ 6 levels "Others","Paramount Pictures",..: 1 5 3 1 5 3 5 5 6 6 ...
## $ season : Factor w/ 4 levels "Spring","Summer",..: 4 1 3 2 1 1 3 1 2 1 ...
## $ y : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
# split the train and test sets with ratio 50:50
#set.seed(1)
#movie_sample1 <- sample(2, nrow(movie_scale), replace=TRUE, prob=c(0.50, 0.50))
# create train and test sets
#train2 <- movie_scale[movie_sample1==1, 1:9]
#test2 <- movie_scale[movie_sample1==2, 1:9]
Budget and revenue are enough to decide the profitable as the profit is calculated as revenue subtracted by budget. Our pre-test with “bestglm” also shows the same result.
loadPkg("bestglm")
res.bestglm0 <- bestglm(Xy = movie_nd, family = binomial,
IC = "AIC", # Information criteria for
method = "exhaustive")
#summary(res.bestglm)
res.bestglm0$BestModels
## revenue budget popularity runtime score vote genres company season
## 1 TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2 TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## 3 TRUE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## 4 TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE TRUE
## 5 TRUE TRUE FALSE FALSE FALSE TRUE FALSE FALSE TRUE
## Criterion
## 1 17.4
## 2 17.6
## 3 18.1
## 4 18.3
## 5 18.5
However, since the relationship between (revenue + budget) and profitable is too direct, we should not use them together.
In reality, we prefer budget rather than revenue to predict profit. A film manager would want to have a prediction of the profit of a movie before its main released date. The information he/she have are the budget, runtime, genres, production company, popularity, vote and score (vote and score can be obtained by a preview screening of a movie, popularity can be generated after advertisement, trailers and some leaks from a movie). Revenue should play a role as the response in the model rather than a predictor.
Let’s try the model with budget and other predictors
# we will use the same train and test sets as in previous chapters
# remove revenue column and use other predictors in train and test sets
train3 <- movie_nd[movie_sample == 1, 2:10] # revenue column is 1, we do not include it
test3 <- movie_nd[movie_sample == 2, 2:10]
dim(train3)
## [1] 2176 9
dim(test3)
## [1] 1049 9
prf_glm <- glm(y ~ ., data = train3, family = "binomial")
summary(prf_glm)
##
## Call:
## glm(formula = y ~ ., family = "binomial", data = train3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.683 0.000 0.297 0.735 1.743
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.05e+00 5.35e-01 -3.84 0.00012 ***
## budget -1.75e-08 2.51e-09 -6.99 2.8e-12 ***
## popularity 1.61e-02 1.26e-02 1.28 0.20159
## runtime 1.66e-03 3.28e-03 0.50 0.61421
## score 2.27e-01 8.46e-02 2.68 0.00739 **
## vote 2.67e-03 4.49e-04 5.95 2.7e-09 ***
## genresAdventure 1.73e-01 2.51e-01 0.69 0.49015
## genresAnimation 2.70e-01 3.97e-01 0.68 0.49646
## genresComedy 4.31e-01 1.91e-01 2.26 0.02360 *
## genresCrime 7.35e-02 2.96e-01 0.25 0.80411
## genresDocumentary 4.66e-01 5.25e-01 0.89 0.37479
## genresDrama 7.25e-02 1.91e-01 0.38 0.70424
## genresFamily 2.31e-01 5.54e-01 0.42 0.67656
## genresFantasy 2.28e-01 4.32e-01 0.53 0.59765
## genresHistory 6.55e-01 7.13e-01 0.92 0.35873
## genresHorror 7.92e-01 3.16e-01 2.51 0.01212 *
## genresMusic 2.74e-01 7.09e-01 0.39 0.69916
## genresMystery -4.23e-01 6.33e-01 -0.67 0.50393
## genresRomance 8.50e-01 4.26e-01 2.00 0.04590 *
## genresScience Fiction 2.64e-01 5.05e-01 0.52 0.60148
## genresThriller -1.50e-01 3.29e-01 -0.46 0.64724
## genresWar -1.36e+00 8.29e-01 -1.64 0.10019
## genresWestern 2.27e+00 1.08e+00 2.10 0.03582 *
## companyParamount Pictures 9.81e-01 2.43e-01 4.03 5.5e-05 ***
## companySony Pictures 6.20e-01 2.22e-01 2.79 0.00524 **
## companyUniversal Pictures 8.52e-01 2.34e-01 3.64 0.00027 ***
## companyWalt Disney 8.60e-01 1.84e-01 4.67 3.0e-06 ***
## companyWarner Bros 7.69e-01 2.45e-01 3.13 0.00172 **
## seasonSummer 3.90e-01 1.74e-01 2.24 0.02505 *
## seasonFall -4.65e-02 1.62e-01 -0.29 0.77361
## seasonWinter 7.04e-02 1.69e-01 0.42 0.67734
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2409.2 on 2175 degrees of freedom
## Residual deviance: 1805.0 on 2145 degrees of freedom
## AIC: 1867
##
## Number of Fisher Scoring iterations: 8
exp(coef(prf_glm))[24:28] # company
## companyParamount Pictures companySony Pictures
## 2.67 1.86
## companyUniversal Pictures companyWalt Disney
## 2.34 2.36
## companyWarner Bros
## 2.16
exp(coef(prf_glm))[7:23] # genres
## genresAdventure genresAnimation genresComedy
## 1.189 1.310 1.539
## genresCrime genresDocumentary genresDrama
## 1.076 1.594 1.075
## genresFamily genresFantasy genresHistory
## 1.260 1.256 1.925
## genresHorror genresMusic genresMystery
## 2.208 1.315 0.655
## genresRomance genresScience Fiction genresThriller
## 2.340 1.302 0.860
## genresWar genresWestern
## 0.256 9.707
exp(coef(prf_glm))[29:31] # seasons
## seasonSummer seasonFall seasonWinter
## 1.478 0.955 1.073
#length(coef(prf_glm))
We can test the effects of different genres/companies/seasons on the prediction to see whether they are significant.
loadPkg("aod") # Analysis of Overdispersed Data, used wald.test in logit example
wald.test(b = coef(prf_glm), Sigma = vcov(prf_glm), Terms = 7:23)
## Wald test:
## ----------
##
## Chi-squared test:
## X2 = 23.8, df = 17, P(> X2) = 0.12
It seems that different genres do not have significant effects on the response in our logit model.
wald.test(b = coef(prf_glm), Sigma = vcov(prf_glm), Terms = 24:28)
## Wald test:
## ----------
##
## Chi-squared test:
## X2 = 46.2, df = 5, P(> X2) = 8.4e-09
The effects of different companies are significant.
wald.test(b = coef(prf_glm), Sigma = vcov(prf_glm), Terms = 29:31)
## Wald test:
## ----------
##
## Chi-squared test:
## X2 = 8.0, df = 3, P(> X2) = 0.045
The effects of different seasons are significant, but not as clear as the effects of different companies.
#drop revenue
res.bestglm <- bestglm(Xy = train3, family = binomial,
IC = "AIC", # Information criteria for
method = "exhaustive")
#summary(res.bestglm)
res.bestglm$BestModels
## budget popularity runtime score vote genres company season Criterion
## 1 TRUE FALSE FALSE TRUE TRUE FALSE TRUE TRUE 1855
## 2 TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE 1856
## 3 TRUE FALSE TRUE TRUE TRUE FALSE TRUE TRUE 1857
## 4 TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE 1858
## 5 TRUE FALSE FALSE TRUE TRUE FALSE TRUE FALSE 1858
#summary(res.bestglm$BestModels)
Build the best model
prf_glm0 <- glm(y ~ budget + score + vote + company + season, data = train3, family = "binomial")
summary(prf_glm0)
##
## Call:
## glm(formula = y ~ budget + score + vote + company + season, family = "binomial",
## data = train3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.733 0.000 0.306 0.763 1.770
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.44e+00 4.57e-01 -3.15 0.00163 **
## budget -1.83e-08 2.30e-09 -7.95 1.9e-15 ***
## score 2.10e-01 7.18e-02 2.93 0.00340 **
## vote 3.18e-03 2.37e-04 13.43 < 2e-16 ***
## companyParamount Pictures 9.64e-01 2.40e-01 4.02 5.7e-05 ***
## companySony Pictures 5.48e-01 2.19e-01 2.50 0.01237 *
## companyUniversal Pictures 8.42e-01 2.30e-01 3.67 0.00024 ***
## companyWalt Disney 8.75e-01 1.80e-01 4.86 1.2e-06 ***
## companyWarner Bros 7.88e-01 2.41e-01 3.27 0.00109 **
## seasonSummer 3.84e-01 1.72e-01 2.24 0.02513 *
## seasonFall -6.93e-02 1.59e-01 -0.43 0.66375
## seasonWinter 5.37e-02 1.66e-01 0.32 0.74708
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2409.2 on 2175 degrees of freedom
## Residual deviance: 1832.9 on 2164 degrees of freedom
## AIC: 1857
##
## Number of Fisher Scoring iterations: 7
#AIC(prf_glm0) # we can check AIC and BIC
#BIC(prf_glm0)
exp(coef(prf_glm0))
## (Intercept) budget
## 0.237 1.000
## score vote
## 1.234 1.003
## companyParamount Pictures companySony Pictures
## 2.623 1.729
## companyUniversal Pictures companyWalt Disney
## 2.322 2.399
## companyWarner Bros seasonSummer
## 2.200 1.469
## seasonFall seasonWinter
## 0.933 1.055
We can validate the model on the testing set with following methods:
prof_glm_pred = predict(object = prf_glm0, test3, type = c("response")) # predict the model on testing data and return predicted probabilities
loadPkg("ResourceSelection") # function hoslem.test( ) for logit model evaluation
# best model
# BUG FIX: hoslem.test() needs the observed outcome as a NUMERIC 0/1 vector;
# the original passed the factor y directly, which makes the internal
# (y - yhat) arithmetic nonsense -- that is why the recorded run reported
# X-squared = 1049, exactly the test-set size.
prf_Hos0 = hoslem.test(as.numeric(as.character(test3$y)), prof_glm_pred) # Hosmer and Lemeshow test, a chi-squared test
prf_Hos0
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: test3$y, prof_glm_pred
## X-squared = 1049, df = 8, p-value <2e-16
The p-value is very low. Note that for the Hosmer–Lemeshow test, a LOW p-value is evidence of lack of fit, so this result does not support a good fit. (The extreme X-squared — equal to the test-set size — also suggests the factor response was passed where a numeric 0/1 vector is expected; see the code note above the test.)
loadPkg("pROC")
h0 <- roc(test3$y ~ prof_glm_pred) # using the model on testing data and see the ROC curve and AUC
auc(h0) # area-under-curve prefer 0.8 or higher.
## Area under the curve: 0.849
plot(h0)
title("ROC curve")
The area under the curve is more than 0.80. This test also agrees with the Hosmer and Lemeshow test.
loadPkg("pscl") # use pR2( ) function to calculate McFadden statistics for model eval
prf_mcFadden = pR2(prf_glm0)
prf_mcFadden
## llh llhNull G2 McFadden r2ML r2CU
## -916.473 -1204.608 576.271 0.239 0.233 0.348
23.9% of the variance in y is explained by the predictors in our model — not great, but not bad either.
#confusion matrix
loadPkg("caret")
# Evaluate the logit model at several probability cut-offs, recording
# accuracy (in percent) and Cohen's kappa for each threshold.
threshold_logit <- c(0.5, 0.6, 0.7, 0.8, 0.9) # candidate thresholds for predicted probabilities
logit_accuracy <- numeric(length(threshold_logit)) # preallocate result vectors
logit_kappa <- numeric(length(threshold_logit))
for (j in seq_along(threshold_logit)) {
# classify as 1 when the predicted probability exceeds the current threshold,
# then compare against the actual labels on the testing data
conf_matrix <- confusionMatrix(data = as.factor(as.integer(prof_glm_pred > threshold_logit[j])), reference = test3$y)
logit_accuracy[j] <- conf_matrix$overall["Accuracy"]*100
logit_kappa[j] <- conf_matrix$overall["Kappa"]
}
# we can see the results at threshold = 0.54 as an example
confusionMatrix(data = as.factor(as.integer(prof_glm_pred>0.54)), reference = test3$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 130 59
## 1 130 730
##
## Accuracy : 0.82
## 95% CI : (0.795, 0.843)
## No Information Rate : 0.752
## P-Value [Acc > NIR] : 9.29e-08
##
## Kappa : 0.468
##
## Mcnemar's Test P-Value : 3.55e-07
##
## Sensitivity : 0.500
## Specificity : 0.925
## Pos Pred Value : 0.688
## Neg Pred Value : 0.849
## Prevalence : 0.248
## Detection Rate : 0.124
## Detection Prevalence : 0.180
## Balanced Accuracy : 0.713
##
## 'Positive' Class : 0
##
# combine results into a dataframe
logit_prediction <- as.data.frame(cbind(threshold = threshold_logit, accuracy = logit_accuracy, kappa = logit_kappa))
logit_prediction
## threshold accuracy kappa
## 1 0.5 81.9 0.425
## 2 0.6 80.7 0.486
## 3 0.7 77.4 0.476
## 4 0.8 69.0 0.379
## 5 0.9 59.2 0.280
loadPkg("ISLR")
loadPkg("tree")
loadPkg("rpart")
loadPkg("rpart.plot")
loadPkg("rattle") # For fancyRpartPlot (Trees) Answer "no" on installing from binary source
# we use the same training and testing data so that we can compare Classification Tree to Logit Regression
prf_dt <- rpart(y~., method = "class", data=train3) #rpart to build our tree
summary(prf_dt) # detailed summary of splits
## Call:
## rpart(formula = y ~ ., data = train3, method = "class")
## n= 2176
##
## CP nsplit rel error xerror xstd
## 1 0.0759 0 1.000 1.000 0.0379
## 2 0.0266 2 0.848 0.863 0.0360
## 3 0.0202 4 0.795 0.869 0.0361
## 4 0.0114 7 0.734 0.822 0.0353
## 5 0.0100 8 0.723 0.829 0.0355
##
## Variable importance
## vote popularity budget genres score company
## 41 32 12 5 5 4
## runtime
## 1
##
## Node number 1: 2176 observations, complexity param=0.0759
## predicted class=1 expected loss=0.242 P(node) =1
## class counts: 527 1649
## probabilities: 0.242 0.758
## left son=2 (744 obs) right son=3 (1432 obs)
## Primary splits:
## vote < 260 to the left, improve=119.0, (0 missing)
## popularity < 15.2 to the left, improve=111.0, (0 missing)
## score < 5.35 to the left, improve= 31.0, (0 missing)
## company splits as LRRRRR, improve= 21.9, (0 missing)
## budget < 83500000 to the left, improve= 13.2, (0 missing)
## Surrogate splits:
## popularity < 14.1 to the left, agree=0.938, adj=0.820, (0 split)
## budget < 10100000 to the left, agree=0.696, adj=0.110, (0 split)
## score < 5.15 to the left, agree=0.691, adj=0.097, (0 split)
## genres splits as RRRRRLRRRRRLRRRRRR, agree=0.666, adj=0.024, (0 split)
## runtime < 86.5 to the left, agree=0.663, adj=0.015, (0 split)
##
## Node number 2: 744 observations, complexity param=0.0759
## predicted class=1 expected loss=0.472 P(node) =0.342
## class counts: 351 393
## probabilities: 0.472 0.528
## left son=4 (364 obs) right son=5 (380 obs)
## Primary splits:
## budget < 14100000 to the right, improve=27.20, (0 missing)
## vote < 95.5 to the left, improve=13.00, (0 missing)
## popularity < 9.44 to the left, improve=12.00, (0 missing)
## score < 5.15 to the left, improve= 7.47, (0 missing)
## company splits as LRLLRR, improve= 4.77, (0 missing)
## Surrogate splits:
## popularity < 7.19 to the right, agree=0.637, adj=0.258, (0 split)
## vote < 97.5 to the right, agree=0.633, adj=0.250, (0 split)
## score < 5.95 to the left, agree=0.616, adj=0.214, (0 split)
## genres splits as LLLRRRRLLRRLLRLLRR, agree=0.594, adj=0.170, (0 split)
## company splits as RLLLLL, agree=0.591, adj=0.165, (0 split)
##
## Node number 3: 1432 observations
## predicted class=1 expected loss=0.123 P(node) =0.658
## class counts: 176 1256
## probabilities: 0.123 0.877
##
## Node number 4: 364 observations, complexity param=0.0266
## predicted class=0 expected loss=0.39 P(node) =0.167
## class counts: 222 142
## probabilities: 0.610 0.390
## left son=8 (113 obs) right son=9 (251 obs)
## Primary splits:
## vote < 91.5 to the left, improve=21.70, (0 missing)
## popularity < 5.77 to the left, improve=17.10, (0 missing)
## score < 5.05 to the left, improve= 4.26, (0 missing)
## genres splits as LRRRRLLLLRLRRLLRLL, improve= 3.43, (0 missing)
## budget < 37500000 to the right, improve= 2.85, (0 missing)
## Surrogate splits:
## popularity < 6.07 to the left, agree=0.882, adj=0.619, (0 split)
## runtime < 153 to the right, agree=0.698, adj=0.027, (0 split)
## genres splits as RRRRRLRRRRRRRRRRLR, agree=0.695, adj=0.018, (0 split)
##
## Node number 5: 380 observations, complexity param=0.0202
## predicted class=1 expected loss=0.339 P(node) =0.175
## class counts: 129 251
## probabilities: 0.339 0.661
## left son=10 (255 obs) right son=11 (125 obs)
## Primary splits:
## company splits as LRRRRR, improve=12.00, (0 missing)
## vote < 45.5 to the left, improve=10.40, (0 missing)
## genres splits as RRLLLLLRRLRRLRRLRR, improve= 8.83, (0 missing)
## popularity < 7.88 to the left, improve= 7.62, (0 missing)
## score < 6.85 to the left, improve= 3.69, (0 missing)
## Surrogate splits:
## budget < 9120000 to the left, agree=0.695, adj=0.072, (0 split)
## genres splits as LLLLLLLLLLLLLLRLRL, agree=0.682, adj=0.032, (0 split)
## vote < 258 to the left, agree=0.676, adj=0.016, (0 split)
##
## Node number 8: 113 observations
## predicted class=0 expected loss=0.133 P(node) =0.0519
## class counts: 98 15
## probabilities: 0.867 0.133
##
## Node number 9: 251 observations, complexity param=0.0266
## predicted class=1 expected loss=0.494 P(node) =0.115
## class counts: 124 127
## probabilities: 0.494 0.506
## left son=18 (109 obs) right son=19 (142 obs)
## Primary splits:
## budget < 32500000 to the right, improve=5.61, (0 missing)
## genres splits as LRRRL-LRLRLLRLLL-L, improve=3.97, (0 missing)
## vote < 196 to the left, improve=3.28, (0 missing)
## popularity < 8.96 to the left, improve=3.05, (0 missing)
## runtime < 95.5 to the right, improve=2.67, (0 missing)
## Surrogate splits:
## genres splits as RLLRL-RRRRLRRRLR-R, agree=0.606, adj=0.092, (0 split)
## popularity < 12.3 to the right, agree=0.590, adj=0.055, (0 split)
## runtime < 108 to the right, agree=0.586, adj=0.046, (0 split)
## company splits as RLRRRR, agree=0.586, adj=0.046, (0 split)
## score < 5.35 to the left, agree=0.578, adj=0.028, (0 split)
##
## Node number 10: 255 observations, complexity param=0.0202
## predicted class=1 expected loss=0.427 P(node) =0.117
## class counts: 109 146
## probabilities: 0.427 0.573
## left son=20 (181 obs) right son=21 (74 obs)
## Primary splits:
## genres splits as RRLLLLLRRLRLLR-L-R, improve=9.30, (0 missing)
## vote < 53.5 to the left, improve=7.72, (0 missing)
## popularity < 7.7 to the left, improve=7.02, (0 missing)
## score < 6.85 to the left, improve=4.48, (0 missing)
## runtime < 170 to the left, improve=3.02, (0 missing)
##
## Node number 11: 125 observations
## predicted class=1 expected loss=0.16 P(node) =0.0574
## class counts: 20 105
## probabilities: 0.160 0.840
##
## Node number 18: 109 observations, complexity param=0.0114
## predicted class=0 expected loss=0.385 P(node) =0.0501
## class counts: 67 42
## probabilities: 0.615 0.385
## left son=36 (69 obs) right son=37 (40 obs)
## Primary splits:
## vote < 204 to the left, improve=4.55, (0 missing)
## popularity < 8.96 to the left, improve=3.96, (0 missing)
## genres splits as LLRLL-LLLRLLLRLL--, improve=3.02, (0 missing)
## season splits as LRLL, improve=2.97, (0 missing)
## runtime < 130 to the left, improve=2.75, (0 missing)
## Surrogate splits:
## popularity < 13 to the left, agree=0.734, adj=0.275, (0 split)
## budget < 35500000 to the right, agree=0.661, adj=0.075, (0 split)
## genres splits as LRLLR-LLLLLLLLLL--, agree=0.661, adj=0.075, (0 split)
## season splits as LLRL, agree=0.642, adj=0.025, (0 split)
##
## Node number 19: 142 observations
## predicted class=1 expected loss=0.401 P(node) =0.0653
## class counts: 57 85
## probabilities: 0.401 0.599
##
## Node number 20: 181 observations, complexity param=0.0202
## predicted class=0 expected loss=0.486 P(node) =0.0832
## class counts: 93 88
## probabilities: 0.514 0.486
## left son=40 (86 obs) right son=41 (95 obs)
## Primary splits:
## vote < 56 to the left, improve=9.72, (0 missing)
## popularity < 9.11 to the left, improve=5.86, (0 missing)
## score < 6.85 to the left, improve=5.69, (0 missing)
## runtime < 170 to the left, improve=4.42, (0 missing)
## budget < 876000 to the right, improve=3.29, (0 missing)
## Surrogate splits:
## popularity < 4.72 to the left, agree=0.901, adj=0.791, (0 split)
## score < 6.35 to the left, agree=0.608, adj=0.174, (0 split)
## runtime < 102 to the left, agree=0.575, adj=0.105, (0 split)
## budget < 3750000 to the left, agree=0.569, adj=0.093, (0 split)
## genres splits as --RRRLR--L-LR--R--, agree=0.569, adj=0.093, (0 split)
##
## Node number 21: 74 observations
## predicted class=1 expected loss=0.216 P(node) =0.034
## class counts: 16 58
## probabilities: 0.216 0.784
##
## Node number 36: 69 observations
## predicted class=0 expected loss=0.275 P(node) =0.0317
## class counts: 50 19
## probabilities: 0.725 0.275
##
## Node number 37: 40 observations
## predicted class=1 expected loss=0.425 P(node) =0.0184
## class counts: 17 23
## probabilities: 0.425 0.575
##
## Node number 40: 86 observations
## predicted class=0 expected loss=0.314 P(node) =0.0395
## class counts: 59 27
## probabilities: 0.686 0.314
##
## Node number 41: 95 observations
## predicted class=1 expected loss=0.358 P(node) =0.0437
## class counts: 34 61
## probabilities: 0.358 0.642
prf_dt$variable.importance#variable importance
## vote popularity budget genres score company
## 162.149 127.432 48.045 19.347 19.210 16.739
## runtime season
## 3.613 0.114
#printcp(prf_dt) # display the results
We can visualize our tree
# plot tree
prp(prf_dt, main = "Classification Trees for Profit Status")
#rpart.plot(prf_dt,main = "Classification Trees for Profit Status")
fancyRpartPlot(prf_dt)
We can optimize our tree by pruning (in complex model, pruning helps to reduce overfitting).
printcp(prf_dt) # display the results
##
## Classification tree:
## rpart(formula = y ~ ., data = train3, method = "class")
##
## Variables actually used in tree construction:
## [1] budget company genres vote
##
## Root node error: 527/2176 = 0.2
##
## n= 2176
##
## CP nsplit rel error xerror xstd
## 1 0.08 0 1.0 1.0 0.04
## 2 0.03 2 0.8 0.9 0.04
## 3 0.02 4 0.8 0.9 0.04
## 4 0.01 7 0.7 0.8 0.04
## 5 0.01 8 0.7 0.8 0.04
plotcp(prf_dt) # visualize cross-validation results
The cross-validated error (xerror) is lowest at the 4th CP value, with 7 splits, so we prune the tree there.
Let’s see our tree after pruning.
prf_dt_prune <- prune(prf_dt, cp = prf_dt$cptable[which.min(prf_dt$cptable[,"xerror"]),"CP"]) # prune trees at cp where xerror is minimum
#prf_dt_prune <- prune(prf_dt, cp = prf_dt$cptable[4,"CP"])
# plot tree
prp(prf_dt_prune, main = "Classification Trees for Profit Status")
#rpart.plot(prf_dt_prune,main = "Classification Trees for Profit Status")
fancyRpartPlot(prf_dt_prune, main = "Classification Trees for Profit Status")
The last split was pruned.
prf_dt.pred <- predict(prf_dt_prune, test3, type = "class" ) # predict the model on the testing data
table_dt <- table(actual = test3$y, predicted = prf_dt.pred) # create a confusion matrix of predicted and actual values
table_dt
## predicted
## actual 0 1
## 0 99 161
## 1 44 745
# Accuracy Calculation
sum(diag(table_dt))/sum(table_dt) # sum(true positive + false negative)/ total
## [1] 0.805
confusionMatrix(reference = test3$y, data = prf_dt.pred)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 99 44
## 1 161 745
##
## Accuracy : 0.805
## 95% CI : (0.779, 0.828)
## No Information Rate : 0.752
## P-Value [Acc > NIR] : 3.26e-05
##
## Kappa : 0.383
##
## Mcnemar's Test P-Value : 5.42e-16
##
## Sensitivity : 0.3808
## Specificity : 0.9442
## Pos Pred Value : 0.6923
## Neg Pred Value : 0.8223
## Prevalence : 0.2479
## Detection Rate : 0.0944
## Detection Prevalence : 0.1363
## Balanced Accuracy : 0.6625
##
## 'Positive' Class : 0
##
#length(prf_dt.pred)
# I see the method to plot ROC for classification tree in youtube and stackoverflow
# There is an option: type = "prob" when predict rpart object, it return predicted probabilities by the classification tree
# I then realize that the values with predicted probabilites > 0.5 are the values predicted as 1 in the above method (chunk Q92)
# I create the another confusion matrix for this part and set threshold = 0.5 and see that the confusion matrices are also similar
# prediction using type = "prob"
prf_dt.pred1 <- predict(prf_dt_prune, test3, type = "prob" )
#prf_dt.conf <- confusionMatrix(data = as.factor(as.integer(prf_dt.pred1[,2]>0.5)), reference = test3$y)
# the accuracy will be the same as the accuracy we calculate when setting type=class, we can see the results to check it
#prf_dt.conf # the accuracy is also 80.5% and the confusion matrix is the same too
#plot ROC and see AUC
h_dt <- roc(test3$y ~ prf_dt.pred1[,2])
auc(h_dt)
## Area under the curve: 0.764
plot(h_dt)
The area under the curve is 0.764 (smaller than 0.8). The classification tree does not seem to be as good as logistic regression in this case.
# Fit a k-nearest-neighbours classifier for a given k and report its accuracy
# on the validation set.  Returns a one-row matrix with columns k and accuracy.
chooseK = function(k, train_set, val_set, train_class, val_class){
  # Classify the validation cases using the training cases as neighbours.
  predicted = knn(train = train_set,  #<- training set cases
                  test = val_set,     #<- validation set cases
                  cl = train_class,   #<- training set class labels
                  k = k,              #<- number of neighbours considered
                  use.all = TRUE)     #<- on distance ties with the kth
                                      #   neighbour, include all tied cases
  # Cross-tabulate predictions against the true classes.
  confusion = table(predicted, val_class)
  # Accuracy = correctly classified cases (diagonal cells) / total cases.
  hits = sum(confusion[row(confusion) == col(confusion)])
  cbind(k = k, accuracy = hits / sum(confusion))
}
loadPkg("class")
#test3
#train3
loadPkg("gmodels")
knn_profit <- knn(train = train3[c(1:5)], test = test3[c(1:5)], cl=train3$y, k=3)
CrossTable( knn_profit,test3$y, prop.chisq = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 1049
##
##
## | test3$y
## knn_profit | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 92 | 85 | 177 |
## | 0.520 | 0.480 | 0.169 |
## | 0.354 | 0.108 | |
## | 0.088 | 0.081 | |
## -------------|-----------|-----------|-----------|
## 1 | 168 | 704 | 872 |
## | 0.193 | 0.807 | 0.831 |
## | 0.646 | 0.892 | |
## | 0.160 | 0.671 | |
## -------------|-----------|-----------|-----------|
## Column Total | 260 | 789 | 1049 |
## | 0.248 | 0.752 | |
## -------------|-----------|-----------|-----------|
##
##
knn_k_profit = sapply(seq(1, 21, by = 2), #<- set k to be odd number from 1 to 21
function(x) chooseK(x,
train_set = train3[c(1:5)],
val_set = test3[c(1:5)],
train_class = train3$y,
val_class = test3$y))
# Reformat the results to graph the results.
#str(knn_different_k)
knn_k_profit = data.frame(k = knn_k_profit[1,],
accuracy = knn_k_profit[2,])
# Plot accuracy vs. k.
ggplot(knn_k_profit,
aes(x = k, y = accuracy)) +
geom_line(color = "orange", size = 1.5) +
geom_point(size = 3) +
theme_bw()
#theme(plot.title = element_text(hjust=0.5, size = 15, face = "bold.italic"))
knn_profit <- knn(train = train3[c(1:5)], test = test3[c(1:5)], cl=train3$y, k=7)
CrossTable( knn_profit,test3$y, prop.chisq = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 1049
##
##
## | test3$y
## knn_profit | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 90 | 66 | 156 |
## | 0.577 | 0.423 | 0.149 |
## | 0.346 | 0.084 | |
## | 0.086 | 0.063 | |
## -------------|-----------|-----------|-----------|
## 1 | 170 | 723 | 893 |
## | 0.190 | 0.810 | 0.851 |
## | 0.654 | 0.916 | |
## | 0.162 | 0.689 | |
## -------------|-----------|-----------|-----------|
## Column Total | 260 | 789 | 1049 |
## | 0.248 | 0.752 | |
## -------------|-----------|-----------|-----------|
##
##
# NOTE(review): the data/reference arguments appear swapped — `data` should be
# the predictions (knn_profit) and `reference` the actual labels (test3$y).
# The statistics printed below (No Information Rate, Prevalence, Sensitivity,
# etc.) are therefore computed with the roles reversed; accuracy and kappa are
# unaffected by the swap.
confusionMatrix(data=test3$y, reference = knn_profit)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 90 170
## 1 66 723
##
## Accuracy : 0.775
## 95% CI : (0.749, 0.8)
## No Information Rate : 0.851
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.303
##
## Mcnemar's Test P-Value : 2.02e-11
##
## Sensitivity : 0.5769
## Specificity : 0.8096
## Pos Pred Value : 0.3462
## Neg Pred Value : 0.9163
## Prevalence : 0.1487
## Detection Rate : 0.0858
## Detection Prevalence : 0.2479
## Balanced Accuracy : 0.6933
##
## 'Positive' Class : 0
##
KNN model is the worst among three models: Logit, Classification Tree and KNN.
P/s: We can try with Random Forest Model as well and it would have the best results, but our project is too long so I do not cover RF here.
In the prediction of profitability, logistic regression gives the best predictions in our case.
We will use KNN model in this part to try for some predictions. (P/s: We will discuss later to see if we should include it in the final summary).
#loadPkg("class")
# we use the same train and test sets as in the linear model section
#train2 <- as.data.frame(scale(movie_num, center = TRUE, scale = TRUE))
#train2$genres <- movie[movie_sample==1, 1:14]$genres
#train set
train2 <- train1_full # duplicate the train set with all predictors we created above
train2$revenue <- scale(train2$revenue, center = TRUE, scale = TRUE) # scale the revenue column; the other numerical variables were scaled before
# numerical columns range from 1:6; 7:9 are our responses
# same with test set
test2 <- test1_full
test2$revenue <- scale(test2$revenue, center = TRUE, scale = TRUE)
str(train2)
## 'data.frame': 2176 obs. of 9 variables:
## $ revenue : num [1:2176, 1] 14.191 4.046 0.873 4.1 6.837 ...
## ..- attr(*, "scaled:center")= num 1.2e+08
## ..- attr(*, "scaled:scale")= num 1.88e+08
## $ budget : num 4.42 4.6 4.94 4.89 5.39 ...
## $ popularity: num 3.355 2.165 0.411 2.395 2.908 ...
## $ runtime : num 2.44 1.78 1.01 1.35 1.44 ...
## $ score : num 1.031 -0.0157 -0.2483 -0.4809 1.1474 ...
## $ vote : num 7.65 2.47 0.81 1.84 4.09 ...
## $ genres : Factor w/ 18 levels "Action","Adventure",..: 1 1 1 9 1 2 1 2 2 2 ...
## $ company : Factor w/ 6 levels "Others","Paramount Pictures",..: 1 3 5 3 5 6 6 6 5 5 ...
## $ season : Factor w/ 4 levels "Spring","Summer",..: 4 3 1 1 1 2 1 2 2 1 ...
str(test2)
## 'data.frame': 1049 obs. of 9 variables:
## $ revenue : num [1:1049, 1] 4.573 5.25 2.555 2.524 -0.191 ...
## ..- attr(*, "scaled:center")= num 1.24e+08
## ..- attr(*, "scaled:scale")= num 1.83e+08
## $ budget : num 5.84 4.71 4.94 3.59 4.83 ...
## $ popularity: num 3.041 2.301 0.542 2.18 0.552 ...
## $ runtime : num 2.779 2.588 -0.511 -0.225 1.825 ...
## $ score : num 0.682 1.496 1.264 -0.248 -0.481 ...
## $ vote : num 2.489 5.745 1.662 1.404 0.942 ...
## $ genres : Factor w/ 18 levels "Action","Adventure",..: 2 1 3 2 1 1 1 7 16 2 ...
## $ company : Factor w/ 6 levels "Others","Paramount Pictures",..: 5 1 5 1 5 1 1 2 4 1 ...
## $ season : Factor w/ 4 levels "Spring","Summer",..: 1 2 3 3 2 2 1 3 1 1 ...
set.seed(123) # set seed to make sure the table won't change after we re-run the code
season_knn <- knn(train = train2[,c(1:6)], test = test2[,c(1:6)], cl=train2$season, k=9)
crosstab <- CrossTable(test2$season, season_knn, prop.chisq = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 1049
##
##
## | season_knn
## test2$season | Spring | Summer | Fall | Winter | Row Total |
## -------------|-----------|-----------|-----------|-----------|-----------|
## Spring | 40 | 63 | 64 | 47 | 214 |
## | 0.187 | 0.294 | 0.299 | 0.220 | 0.204 |
## | 0.189 | 0.243 | 0.190 | 0.195 | |
## | 0.038 | 0.060 | 0.061 | 0.045 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
## Summer | 75 | 84 | 63 | 59 | 281 |
## | 0.267 | 0.299 | 0.224 | 0.210 | 0.268 |
## | 0.354 | 0.324 | 0.187 | 0.245 | |
## | 0.071 | 0.080 | 0.060 | 0.056 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
## Fall | 49 | 72 | 125 | 81 | 327 |
## | 0.150 | 0.220 | 0.382 | 0.248 | 0.312 |
## | 0.231 | 0.278 | 0.371 | 0.336 | |
## | 0.047 | 0.069 | 0.119 | 0.077 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
## Winter | 48 | 40 | 85 | 54 | 227 |
## | 0.211 | 0.176 | 0.374 | 0.238 | 0.216 |
## | 0.226 | 0.154 | 0.252 | 0.224 | |
## | 0.046 | 0.038 | 0.081 | 0.051 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 212 | 259 | 337 | 241 | 1049 |
## | 0.202 | 0.247 | 0.321 | 0.230 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
##
##
set.seed(123)
knn_season_k = sapply(seq(1, 41, by = 2), #<- set k to be odd numbers from 1 to 41
function(x) chooseK(x,
train_set = train2[c(1:6)],
val_set = test2[c(1:6)],
train_class = train2$season,
val_class = test2$season))
knn_season_k = data.frame(k = knn_season_k[1,],
accuracy = knn_season_k[2,])
# Plot accuracy vs. k.
ggplot(knn_season_k,
aes(x = k, y = accuracy)) +
geom_line(color = "orange", size = 1.5) +
geom_point(size = 3)
k= 35 gives the best accuracy
There are many genres so we won’t show the cross table
set.seed(123)
knn_genres_k = sapply(seq(1, 41, by = 2), #<- set k to be odd numbers from 1 to 41
function(x) chooseK(x,
train_set = train2[c(1:6)],
val_set = test2[c(1:6)],
train_class = train2$genres,
val_class = test2$genres))
knn_genres_k = data.frame(k = knn_genres_k[1,],
accuracy = knn_genres_k[2,])
# Plot accuracy vs. k.
ggplot(knn_genres_k,
aes(x = k, y = accuracy)) +
geom_line(color = "orange", size = 1.5) +
geom_point(size = 3)
k = 27 gives the best accuracy
set.seed(123)
company_knn <- knn(train = train2[,c(1:6)], test = test2[,c(1:6)], cl=train2$company, k=9)
crosstab <- CrossTable(test2$company, company_knn, prop.chisq = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 1049
##
##
## | company_knn
## test2$company | Others | Paramount Pictures | Sony Pictures | Universal Pictures | Walt Disney | Warner Bros | Row Total |
## -------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
## Others | 447 | 10 | 10 | 17 | 48 | 2 | 534 |
## | 0.837 | 0.019 | 0.019 | 0.032 | 0.090 | 0.004 | 0.509 |
## | 0.548 | 0.370 | 0.312 | 0.395 | 0.381 | 0.400 | |
## | 0.426 | 0.010 | 0.010 | 0.016 | 0.046 | 0.002 | |
## -------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
## Paramount Pictures | 67 | 2 | 1 | 2 | 14 | 0 | 86 |
## | 0.779 | 0.023 | 0.012 | 0.023 | 0.163 | 0.000 | 0.082 |
## | 0.082 | 0.074 | 0.031 | 0.047 | 0.111 | 0.000 | |
## | 0.064 | 0.002 | 0.001 | 0.002 | 0.013 | 0.000 | |
## -------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
## Sony Pictures | 62 | 5 | 3 | 9 | 9 | 0 | 88 |
## | 0.705 | 0.057 | 0.034 | 0.102 | 0.102 | 0.000 | 0.084 |
## | 0.076 | 0.185 | 0.094 | 0.209 | 0.071 | 0.000 | |
## | 0.059 | 0.005 | 0.003 | 0.009 | 0.009 | 0.000 | |
## -------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
## Universal Pictures | 81 | 3 | 9 | 7 | 17 | 0 | 117 |
## | 0.692 | 0.026 | 0.077 | 0.060 | 0.145 | 0.000 | 0.112 |
## | 0.099 | 0.111 | 0.281 | 0.163 | 0.135 | 0.000 | |
## | 0.077 | 0.003 | 0.009 | 0.007 | 0.016 | 0.000 | |
## -------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
## Walt Disney | 109 | 7 | 7 | 7 | 27 | 2 | 159 |
## | 0.686 | 0.044 | 0.044 | 0.044 | 0.170 | 0.013 | 0.152 |
## | 0.134 | 0.259 | 0.219 | 0.163 | 0.214 | 0.400 | |
## | 0.104 | 0.007 | 0.007 | 0.007 | 0.026 | 0.002 | |
## -------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
## Warner Bros | 50 | 0 | 2 | 1 | 11 | 1 | 65 |
## | 0.769 | 0.000 | 0.031 | 0.015 | 0.169 | 0.015 | 0.062 |
## | 0.061 | 0.000 | 0.062 | 0.023 | 0.087 | 0.200 | |
## | 0.048 | 0.000 | 0.002 | 0.001 | 0.010 | 0.001 | |
## -------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
## Column Total | 816 | 27 | 32 | 43 | 126 | 5 | 1049 |
## | 0.778 | 0.026 | 0.031 | 0.041 | 0.120 | 0.005 | |
## -------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
##
##
set.seed(123)
knn_company_k = sapply(seq(1, 41, by = 2), #<- set k to be odd numbers from 1 to 41
function(x) chooseK(x,
train_set = train2[c(1:6)],
val_set = test2[c(1:6)],
train_class = train2$company,
val_class = test2$company))
knn_company_k = data.frame(k = knn_company_k[1,],
accuracy = knn_company_k[2,])
# Plot accuracy vs. k.
ggplot(knn_company_k,
aes(x = k, y = accuracy)) +
geom_line(color = "orange", size = 1.5) +
geom_point(size = 3)
k = 23 gives the best accuracy
We use time series models in this part. We will try HoltWinters and ARIMA.
# we will subset our data to get the movies from 1995-2016 since we have consistent data in that range
# we use time series with frequency = 4 (by quarter)
movie_ts <- subset(movie, (year > 1994)) # subset data after 1994
movie_ts <- subset(movie_ts, select = c("revenue", "year", "quarter")) # select the 3 columns we use for the time series
movie_ts <- movie_ts[order(movie_ts$year),] # order by year
#unique(movie_ts$year)
#str(movie_ts)
loadPkg("dplyr")
# total revenue for each quarter of each year
# (summarise_all replaces the deprecated summarise_each(funs(sum)); result is identical)
movie_ts = movie_ts %>% group_by(year, quarter) %>% summarise_all(sum)
movie_ts$quarter <- factor(movie_ts$quarter, levels = c("Q1", "Q2", "Q3", "Q4")) # reorder the factor levels
movie_ts <- movie_ts[order(movie_ts$year, movie_ts$quarter),] # order by year then by quarter
#movie_ts
# divide the data into 2 sets: training and testing
movie.ts <- ts(movie_ts$revenue, frequency = 4, start = c(1995,1), end = c(2010,4)) # training data from 1995-2010
movie.ts.test <- ts(tail(movie_ts, 23)$revenue, frequency = 4, start = c(2011,1), end = c(2016,3)) # testing data from 2011-2016, used for evaluating our time series model
#movie.ts.test
ggplot(movie_ts, aes(x=quarter, y=revenue, fill = quarter)) +
ggtitle("Revenue distribution by quarter") +
geom_boxplot(fill = c("red","dodgerblue","yellow","green"),alpha = 0.8) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))
Movies in quarter 2 seems to perform well in box office revenue.
plot(movie.ts, xlab = "Year", ylab="Revenue")
movie_dcp <- decompose(movie.ts)
summary(movie_dcp$seasonal)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.55e+09 -6.35e+08 2.92e+08 0.00e+00 9.27e+08 9.67e+08
summary(movie_dcp$random)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -1.05e+09 -4.67e+08 -8.91e+07 -3.50e+06 4.29e+08 1.99e+09 4
plot(movie_dcp)
movie.ts.smth <- HoltWinters(movie.ts)
movie.ts.smth
## Holt-Winters exponential smoothing with trend and additive seasonal component.
##
## Call:
## HoltWinters(x = movie.ts)
##
## Smoothing parameters:
## alpha: 0.0475
## beta : 0.154
## gamma: 0.313
##
## Coefficients:
## [,1]
## a 4.70e+09
## b 5.96e+07
## s1 -1.15e+09
## s2 1.73e+09
## s3 3.53e+08
## s4 1.03e+09
movie.ts.smth$SSE
## [1] 4.17e+19
plot(movie.ts.smth)
Prediction on the future
loadPkg("forecast")
#accuracy(movie_hw)
movie_ft <- forecast(movie.ts.smth, h=23)
plot(movie_ft)
#sm <- ma(movie.ts, order=4) # 4 quarters moving average
#lines(sm, col="red") # plot
We now compare the forecasts against the actual values in the testing data.
plot(movie_ft)
lines(movie.ts.test, col = 'red')
#ggplot(movie_hw)
legend("topleft",lty=1,bty = "n",col=c("red","blue"),c("actual","predicted"))
Better visualization with highcharter
loadPkg("highcharter")
hchart(movie_ft) %>%
hc_add_series_ts(movie.ts.test, color = "red", name = "ACTUAL") # plot the predicted and actual values in testing data, and the values in training data
# we can have a more detailed plot with only predicted and actual values in testing data
highchart(type="stock") %>% # try type = "chart", "stock" for different visualizations
hc_chart(type = "line") %>% # try type = "line", bar", "column" for different visualizations
hc_add_series_ts(movie.ts.test,name = "ACTUAL", color = "red" ) %>% # plot the actual data labeled by red color
hc_add_series(name = "PREDICTED", movie_ft, color = "blue" ) %>% # plot the hw predictions labeled by blue color
hc_legend(align = "left", verticalAlign = "top",
layout = "vertical", x = 0, y = 100)
Nice prediction since the actual values are all in the 95% confidence interval.
movie_arima <- auto.arima(movie.ts) # create arima model
movie_arima <- forecast(movie_arima, h=23) # see how it forcast
plot(movie_arima)
#movie_arima
plot(movie_arima)
lines(movie.ts.test, col = 'red')
legend("topleft",lty=1,bty = "n",col=c("red","blue"),c("actual","predicted"))
We can also plot nicer with highcharter
hchart(movie_arima) %>%
hc_add_series_ts(movie.ts.test, color = "red", name = "ACTUAL") # general graph
# more detailed graph, I try different stuffs here
highchart(type="chart") %>% # try type = "chart", "stock" for different visualizations
hc_chart(type = "column") %>% # try type = "line", bar", "column" for different visualizations
hc_add_series_ts(movie.ts.test,name = "ACTUAL", color = "red" ) %>% # plot the actual data labeled by red color
hc_add_series(name = "PREDICTED", movie_arima, color = "blue" ) %>% # plot the hw predictions labeled by blue color
hc_legend(align = "left", verticalAlign = "top",
layout = "vertical", x = 0, y = 100)
In ARIMA model, some actual values are out of 95% confidence interval.
accuracy(movie_ft, movie.ts.test)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 1.09e+08 8.33e+08 6.51e+08 -1.23 22.9 0.812 -0.156
## Test set -3.03e+08 1.32e+09 9.99e+08 -13.25 22.2 1.245 -0.231
## Theil's U
## Training set NA
## Test set 0.35
accuracy(movie_arima, movie.ts.test)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 2.25e+07 7.06e+08 5.23e+08 -4.1 17.7 0.652 -0.0119
## Test set -3.20e+08 1.31e+09 9.88e+08 -13.7 22.2 1.232 -0.2368
## Theil's U
## Training set NA
## Test set 0.352
ARIMA does better on the training set, but the performance of the two models on the testing set is not much different. In my opinion, HoltWinters is better here, as the actual values always fall within the 95% confidence interval and the gap between its training and testing errors is smaller. ARIMA may be overfitting (lower error on the training set but a larger degradation on the testing set).